diff options
| author | Ingo Molnar <mingo@kernel.org> | 2024-03-25 11:32:29 +0100 |
|---|---|---|
| committer | Ingo Molnar <mingo@kernel.org> | 2024-03-25 11:32:29 +0100 |
| commit | f4566a1e73957800df75a3dd2dccee8a4697f327 (patch) | |
| tree | b043b875228c0b25988af66c680d60cae69d761d /kernel | |
| parent | sched/fair: Fix typos in comments (diff) | |
| parent | Linux 6.9-rc1 (diff) | |
| download | linux-f4566a1e73957800df75a3dd2dccee8a4697f327.tar.gz linux-f4566a1e73957800df75a3dd2dccee8a4697f327.zip | |
Merge tag 'v6.9-rc1' into sched/core, to pick up fixes and to refresh the branch
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
103 files changed, 7136 insertions, 3050 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 946dffa048b7..6c34e63c88ff 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -2,11 +2,13 @@ menu "Kexec and crash features" -config CRASH_CORE +config CRASH_RESERVE + bool + +config VMCORE_INFO bool config KEXEC_CORE - select CRASH_CORE bool config KEXEC_ELF @@ -95,9 +97,11 @@ config KEXEC_JUMP config CRASH_DUMP bool "kernel crash dumps" + default y depends on ARCH_SUPPORTS_CRASH_DUMP - select CRASH_CORE - select KEXEC_CORE + depends on KEXEC_CORE + select VMCORE_INFO + select CRASH_RESERVE help Generate crash dump after being started by kexec. This should be normally only set in special crash dump kernels diff --git a/kernel/Makefile b/kernel/Makefile index ce105a5558fc..3c13240dfc9f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -68,8 +68,10 @@ obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o -obj-$(CONFIG_CRASH_CORE) += crash_core.o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o elfcorehdr.o +obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o obj-$(CONFIG_KEXEC_CORE) += kexec_core.o +obj-$(CONFIG_CRASH_DUMP) += crash_core.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o @@ -120,7 +122,6 @@ obj-$(CONFIG_PERF_EVENTS) += events/ obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o obj-$(CONFIG_PADATA) += padata.o -obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_TORTURE_TEST) += torture.o diff --git a/kernel/audit.c b/kernel/audit.c index 9c8e5f732c4c..e7a62ebbf4d1 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1693,9 +1693,7 @@ static int __init audit_init(void) if (audit_initialized == AUDIT_DISABLED) return 0; - audit_buffer_cache = kmem_cache_create("audit_buffer", - sizeof(struct audit_buffer), - 0, SLAB_PANIC, NULL); + audit_buffer_cache = KMEM_CACHE(audit_buffer, SLAB_PANIC); skb_queue_head_init(&audit_queue); skb_queue_head_init(&audit_retry_queue); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 8317a37dea0b..be8c680121e4 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -788,7 +788,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) static inline int audit_dupe_lsm_field(struct audit_field *df, struct audit_field *sf) { - int ret = 0; + int ret; char *lsm_str; /* our own copy of lsm_str */ diff --git a/kernel/bounds.c b/kernel/bounds.c index b529182e8b04..c5a9fcd2d622 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -19,7 +19,7 @@ int main(void) DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); #ifdef CONFIG_SMP - DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); + DEFINE(NR_CPUS_BITS, bits_per(CONFIG_NR_CPUS)); #endif DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); #ifdef CONFIG_LRU_GEN diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index 6a906ff93006..bc25f5098a25 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -3,6 +3,7 @@ # BPF interpreter that, for example, classic socket filters depend on. config BPF bool + select CRYPTO_LIB_SHA1 # Used by archs to tell that they support BPF JIT compiler plus which # flavour. Only one of the two can be selected for a specific arch since diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index f526b7573e97..368c5d86b5b7 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse endif CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o @@ -15,6 +15,9 @@ obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o +ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy) +obj-$(CONFIG_BPF_SYSCALL) += arena.o +endif obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c new file mode 100644 index 000000000000..86571e760dd6 --- /dev/null +++ b/kernel/bpf/arena.c @@ -0,0 +1,558 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/err.h> +#include <linux/btf_ids.h> +#include <linux/vmalloc.h> +#include <linux/pagemap.h> + +/* + * bpf_arena is a sparsely populated shared memory region between bpf program and + * user space process. + * + * For example on x86-64 the values could be: + * user_vm_start 7f7d26200000 // picked by mmap() + * kern_vm_start ffffc90001e69000 // picked by get_vm_area() + * For user space all pointers within the arena are normal 8-byte addresses. + * In this example 7f7d26200000 is the address of the first page (pgoff=0). + * The bpf program will access it as: kern_vm_start + lower_32bit_of_user_ptr + * (u32)7f7d26200000 -> 26200000 + * hence + * ffffc90001e69000 + 26200000 == ffffc90028069000 is "pgoff=0" within 4Gb + * kernel memory region. + * + * BPF JITs generate the following code to access arena: + * mov eax, eax // eax has lower 32-bit of user pointer + * mov word ptr [rax + r12 + off], bx + * where r12 == kern_vm_start and off is s16. + * Hence allocate 4Gb + GUARD_SZ/2 on each side. + * + * Initially kernel vm_area and user vma are not populated. + * User space can fault-in any address which will insert the page + * into kernel and user vma. + * bpf program can allocate a page via bpf_arena_alloc_pages() kfunc + * which will insert it into kernel vm_area. + * The later fault-in from user space will populate that page into user vma. + */ + +/* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */ +#define GUARD_SZ (1ull << sizeof(((struct bpf_insn *)0)->off) * 8) +#define KERN_VM_SZ ((1ull << 32) + GUARD_SZ) + +struct bpf_arena { + struct bpf_map map; + u64 user_vm_start; + u64 user_vm_end; + struct vm_struct *kern_vm; + struct maple_tree mt; + struct list_head vma_list; + struct mutex lock; +}; + +u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena) +{ + return arena ? (u64) (long) arena->kern_vm->addr + GUARD_SZ / 2 : 0; +} + +u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena) +{ + return arena ? arena->user_vm_start : 0; +} + +static long arena_map_peek_elem(struct bpf_map *map, void *value) +{ + return -EOPNOTSUPP; +} + +static long arena_map_push_elem(struct bpf_map *map, void *value, u64 flags) +{ + return -EOPNOTSUPP; +} + +static long arena_map_pop_elem(struct bpf_map *map, void *value) +{ + return -EOPNOTSUPP; +} + +static long arena_map_delete_elem(struct bpf_map *map, void *value) +{ + return -EOPNOTSUPP; +} + +static int arena_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -EOPNOTSUPP; +} + +static long compute_pgoff(struct bpf_arena *arena, long uaddr) +{ + return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT; +} + +static struct bpf_map *arena_map_alloc(union bpf_attr *attr) +{ + struct vm_struct *kern_vm; + int numa_node = bpf_map_attr_numa_node(attr); + struct bpf_arena *arena; + u64 vm_range; + int err = -ENOMEM; + + if (attr->key_size || attr->value_size || attr->max_entries == 0 || + /* BPF_F_MMAPABLE must be set */ + !(attr->map_flags & BPF_F_MMAPABLE) || + /* No unsupported flags present */ + (attr->map_flags & ~(BPF_F_SEGV_ON_FAULT | BPF_F_MMAPABLE | BPF_F_NO_USER_CONV))) + return ERR_PTR(-EINVAL); + + if (attr->map_extra & ~PAGE_MASK) + /* If non-zero the map_extra is an expected user VMA start address */ + return ERR_PTR(-EINVAL); + + vm_range = (u64)attr->max_entries * PAGE_SIZE; + if (vm_range > (1ull << 32)) + return ERR_PTR(-E2BIG); + + if ((attr->map_extra >> 32) != ((attr->map_extra + vm_range - 1) >> 32)) + /* user vma must not cross 32-bit boundary */ + return ERR_PTR(-ERANGE); + + kern_vm = get_vm_area(KERN_VM_SZ, VM_SPARSE | VM_USERMAP); + if (!kern_vm) + return ERR_PTR(-ENOMEM); + + arena = bpf_map_area_alloc(sizeof(*arena), numa_node); + if (!arena) + goto err; + + arena->kern_vm = kern_vm; + arena->user_vm_start = attr->map_extra; + if (arena->user_vm_start) + arena->user_vm_end = arena->user_vm_start + vm_range; + + INIT_LIST_HEAD(&arena->vma_list); + bpf_map_init_from_attr(&arena->map, attr); + mt_init_flags(&arena->mt, MT_FLAGS_ALLOC_RANGE); + mutex_init(&arena->lock); + + return &arena->map; +err: + free_vm_area(kern_vm); + return ERR_PTR(err); +} + +static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data) +{ + struct page *page; + pte_t pte; + + pte = ptep_get(ptep); + if (!pte_present(pte)) /* sanity check */ + return 0; + page = pte_page(pte); + /* + * We do not update pte here: + * 1. Nobody should be accessing bpf_arena's range outside of a kernel bug + * 2. TLB flushing is batched or deferred. Even if we clear pte, + * the TLB entries can stick around and continue to permit access to + * the freed page. So it all relies on 1. + */ + __free_page(page); + return 0; +} + +static void arena_map_free(struct bpf_map *map) +{ + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + /* + * Check that user vma-s are not around when bpf map is freed. + * mmap() holds vm_file which holds bpf_map refcnt. + * munmap() must have happened on vma followed by arena_vm_close() + * which would clear arena->vma_list. + */ + if (WARN_ON_ONCE(!list_empty(&arena->vma_list))) + return; + + /* + * free_vm_area() calls remove_vm_area() that calls free_unmap_vmap_area(). + * It unmaps everything from vmalloc area and clears pgtables. + * Call apply_to_existing_page_range() first to find populated ptes and + * free those pages. + */ + apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena), + KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL); + free_vm_area(arena->kern_vm); + mtree_destroy(&arena->mt); + bpf_map_area_free(arena); +} + +static void *arena_map_lookup_elem(struct bpf_map *map, void *key) +{ + return ERR_PTR(-EINVAL); +} + +static long arena_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + return -EOPNOTSUPP; +} + +static int arena_map_check_btf(const struct bpf_map *map, const struct btf *btf, + const struct btf_type *key_type, const struct btf_type *value_type) +{ + return 0; +} + +static u64 arena_map_mem_usage(const struct bpf_map *map) +{ + return 0; +} + +struct vma_list { + struct vm_area_struct *vma; + struct list_head head; +}; + +static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma) +{ + struct vma_list *vml; + + vml = kmalloc(sizeof(*vml), GFP_KERNEL); + if (!vml) + return -ENOMEM; + vma->vm_private_data = vml; + vml->vma = vma; + list_add(&vml->head, &arena->vma_list); + return 0; +} + +static void arena_vm_close(struct vm_area_struct *vma) +{ + struct bpf_map *map = vma->vm_file->private_data; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + struct vma_list *vml; + + guard(mutex)(&arena->lock); + vml = vma->vm_private_data; + list_del(&vml->head); + vma->vm_private_data = NULL; + kfree(vml); +} + +#define MT_ENTRY ((void *)&arena_map_ops) /* unused. has to be valid pointer */ + +static vm_fault_t arena_vm_fault(struct vm_fault *vmf) +{ + struct bpf_map *map = vmf->vma->vm_file->private_data; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + struct page *page; + long kbase, kaddr; + int ret; + + kbase = bpf_arena_get_kern_vm_start(arena); + kaddr = kbase + (u32)(vmf->address & PAGE_MASK); + + guard(mutex)(&arena->lock); + page = vmalloc_to_page((void *)kaddr); + if (page) + /* already have a page vmap-ed */ + goto out; + + if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT) + /* User space requested to segfault when page is not allocated by bpf prog */ + return VM_FAULT_SIGSEGV; + + ret = mtree_insert(&arena->mt, vmf->pgoff, MT_ENTRY, GFP_KERNEL); + if (ret) + return VM_FAULT_SIGSEGV; + + /* Account into memcg of the process that created bpf_arena */ + ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page); + if (ret) { + mtree_erase(&arena->mt, vmf->pgoff); + return VM_FAULT_SIGSEGV; + } + + ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page); + if (ret) { + mtree_erase(&arena->mt, vmf->pgoff); + __free_page(page); + return VM_FAULT_SIGSEGV; + } +out: + page_ref_add(page, 1); + vmf->page = page; + return 0; +} + +static const struct vm_operations_struct arena_vm_ops = { + .close = arena_vm_close, + .fault = arena_vm_fault, +}; + +static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct bpf_map *map = filp->private_data; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + long ret; + + if (pgoff) + return -EINVAL; + if (len > (1ull << 32)) + return -E2BIG; + + /* if user_vm_start was specified at arena creation time */ + if (arena->user_vm_start) { + if (len > arena->user_vm_end - arena->user_vm_start) + return -E2BIG; + if (len != arena->user_vm_end - arena->user_vm_start) + return -EINVAL; + if (addr != arena->user_vm_start) + return -EINVAL; + } + + ret = current->mm->get_unmapped_area(filp, addr, len * 2, 0, flags); + if (IS_ERR_VALUE(ret)) + return ret; + if ((ret >> 32) == ((ret + len - 1) >> 32)) + return ret; + if (WARN_ON_ONCE(arena->user_vm_start)) + /* checks at map creation time should prevent this */ + return -EFAULT; + return round_up(ret, 1ull << 32); +} + +static int arena_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +{ + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + guard(mutex)(&arena->lock); + if (arena->user_vm_start && arena->user_vm_start != vma->vm_start) + /* + * If map_extra was not specified at arena creation time then + * 1st user process can do mmap(NULL, ...) to pick user_vm_start + * 2nd user process must pass the same addr to mmap(addr, MAP_FIXED..); + * or + * specify addr in map_extra and + * use the same addr later with mmap(addr, MAP_FIXED..); + */ + return -EBUSY; + + if (arena->user_vm_end && arena->user_vm_end != vma->vm_end) + /* all user processes must have the same size of mmap-ed region */ + return -EBUSY; + + /* Earlier checks should prevent this */ + if (WARN_ON_ONCE(vma->vm_end - vma->vm_start > (1ull << 32) || vma->vm_pgoff)) + return -EFAULT; + + if (remember_vma(arena, vma)) + return -ENOMEM; + + arena->user_vm_start = vma->vm_start; + arena->user_vm_end = vma->vm_end; + /* + * bpf_map_mmap() checks that it's being mmaped as VM_SHARED and + * clears VM_MAYEXEC. Set VM_DONTEXPAND as well to avoid + * potential change of user_vm_start. + */ + vm_flags_set(vma, VM_DONTEXPAND); + vma->vm_ops = &arena_vm_ops; + return 0; +} + +static int arena_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off) +{ + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + if ((u64)off > arena->user_vm_end - arena->user_vm_start) + return -ERANGE; + *imm = (unsigned long)arena->user_vm_start; + return 0; +} + +BTF_ID_LIST_SINGLE(bpf_arena_map_btf_ids, struct, bpf_arena) +const struct bpf_map_ops arena_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc = arena_map_alloc, + .map_free = arena_map_free, + .map_direct_value_addr = arena_map_direct_value_addr, + .map_mmap = arena_map_mmap, + .map_get_unmapped_area = arena_get_unmapped_area, + .map_get_next_key = arena_map_get_next_key, + .map_push_elem = arena_map_push_elem, + .map_peek_elem = arena_map_peek_elem, + .map_pop_elem = arena_map_pop_elem, + .map_lookup_elem = arena_map_lookup_elem, + .map_update_elem = arena_map_update_elem, + .map_delete_elem = arena_map_delete_elem, + .map_check_btf = arena_map_check_btf, + .map_mem_usage = arena_map_mem_usage, + .map_btf_id = &bpf_arena_map_btf_ids[0], +}; + +static u64 clear_lo32(u64 val) +{ + return val & ~(u64)~0U; +} + +/* + * Allocate pages and vmap them into kernel vmalloc area. + * Later the pages will be mmaped into user space vma. + */ +static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id) +{ + /* user_vm_end/start are fixed before bpf prog runs */ + long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT; + u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena); + struct page **pages; + long pgoff = 0; + u32 uaddr32; + int ret, i; + + if (page_cnt > page_cnt_max) + return 0; + + if (uaddr) { + if (uaddr & ~PAGE_MASK) + return 0; + pgoff = compute_pgoff(arena, uaddr); + if (pgoff + page_cnt > page_cnt_max) + /* requested address will be outside of user VMA */ + return 0; + } + + /* zeroing is needed, since alloc_pages_bulk_array() only fills in non-zero entries */ + pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL); + if (!pages) + return 0; + + guard(mutex)(&arena->lock); + + if (uaddr) + ret = mtree_insert_range(&arena->mt, pgoff, pgoff + page_cnt - 1, + MT_ENTRY, GFP_KERNEL); + else + ret = mtree_alloc_range(&arena->mt, &pgoff, MT_ENTRY, + page_cnt, 0, page_cnt_max - 1, GFP_KERNEL); + if (ret) + goto out_free_pages; + + ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO, + node_id, page_cnt, pages); + if (ret) + goto out; + + uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE); + /* Earlier checks make sure that uaddr32 + page_cnt * PAGE_SIZE will not overflow 32-bit */ + ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32, + kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages); + if (ret) { + for (i = 0; i < page_cnt; i++) + __free_page(pages[i]); + goto out; + } + kvfree(pages); + return clear_lo32(arena->user_vm_start) + uaddr32; +out: + mtree_erase(&arena->mt, pgoff); +out_free_pages: + kvfree(pages); + return 0; +} + +/* + * If page is present in vmalloc area, unmap it from vmalloc area, + * unmap it from all user space vma-s, + * and free it. + */ +static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt) +{ + struct vma_list *vml; + + list_for_each_entry(vml, &arena->vma_list, head) + zap_page_range_single(vml->vma, uaddr, + PAGE_SIZE * page_cnt, NULL); +} + +static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt) +{ + u64 full_uaddr, uaddr_end; + long kaddr, pgoff, i; + struct page *page; + + /* only aligned lower 32-bit are relevant */ + uaddr = (u32)uaddr; + uaddr &= PAGE_MASK; + full_uaddr = clear_lo32(arena->user_vm_start) + uaddr; + uaddr_end = min(arena->user_vm_end, full_uaddr + (page_cnt << PAGE_SHIFT)); + if (full_uaddr >= uaddr_end) + return; + + page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT; + + guard(mutex)(&arena->lock); + + pgoff = compute_pgoff(arena, uaddr); + /* clear range */ + mtree_store_range(&arena->mt, pgoff, pgoff + page_cnt - 1, NULL, GFP_KERNEL); + + if (page_cnt > 1) + /* bulk zap if multiple pages being freed */ + zap_pages(arena, full_uaddr, page_cnt); + + kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr; + for (i = 0; i < page_cnt; i++, kaddr += PAGE_SIZE, full_uaddr += PAGE_SIZE) { + page = vmalloc_to_page((void *)kaddr); + if (!page) + continue; + if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */ + zap_pages(arena, full_uaddr, 1); + vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE); + __free_page(page); + } +} + +__bpf_kfunc_start_defs(); + +__bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt, + int node_id, u64 flags) +{ + struct bpf_map *map = p__map; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt) + return NULL; + + return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id); +} + +__bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt) +{ + struct bpf_map *map = p__map; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign) + return; + arena_free_pages(arena, (long)ptr__ign, page_cnt); +} +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(arena_kfuncs) +BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_KFUNCS_END(arena_kfuncs) + +static const struct btf_kfunc_id_set common_kfunc_set = { + .owner = THIS_MODULE, + .set = &arena_kfuncs, +}; + +static int __init kfunc_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set); +} +late_initcall(kfunc_init); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 0bdbbbeab155..13358675ff2e 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -82,7 +82,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); u32 elem_size, index_mask, max_entries; - bool bypass_spec_v1 = bpf_bypass_spec_v1(); + bool bypass_spec_v1 = bpf_bypass_spec_v1(NULL); u64 array_size, mask64; struct bpf_array *array; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 0fae79164187..112581cf97e7 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -548,7 +548,7 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, return -ENOENT; /* Only allow sleepable program for resched-able iterator */ - if (prog->aux->sleepable && !bpf_iter_target_support_resched(tinfo)) + if (prog->sleepable && !bpf_iter_target_support_resched(tinfo)) return -EINVAL; link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); @@ -697,7 +697,7 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) struct bpf_run_ctx run_ctx, *old_run_ctx; int ret; - if (prog->aux->sleepable) { + if (prog->sleepable) { rcu_read_lock_trace(); migrate_disable(); might_fault(); diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 146824cc9689..bdea1a459153 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -414,47 +414,21 @@ void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) bpf_selem_unlink_storage(selem, reuse_now); } -/* If cacheit_lockit is false, this lookup function is lockless */ -struct bpf_local_storage_data * -bpf_local_storage_lookup(struct bpf_local_storage *local_storage, - struct bpf_local_storage_map *smap, - bool cacheit_lockit) +void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, + struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *selem) { - struct bpf_local_storage_data *sdata; - struct bpf_local_storage_elem *selem; - - /* Fast path (cache hit) */ - sdata = rcu_dereference_check(local_storage->cache[smap->cache_idx], - bpf_rcu_lock_held()); - if (sdata && rcu_access_pointer(sdata->smap) == smap) - return sdata; - - /* Slow path (cache miss) */ - hlist_for_each_entry_rcu(selem, &local_storage->list, snode, - rcu_read_lock_trace_held()) - if (rcu_access_pointer(SDATA(selem)->smap) == smap) - break; - - if (!selem) - return NULL; - - sdata = SDATA(selem); - if (cacheit_lockit) { - unsigned long flags; - - /* spinlock is needed to avoid racing with the - * parallel delete. Otherwise, publishing an already - * deleted sdata to the cache will become a use-after-free - * problem in the next bpf_local_storage_lookup(). - */ - raw_spin_lock_irqsave(&local_storage->lock, flags); - if (selem_linked_to_storage(selem)) - rcu_assign_pointer(local_storage->cache[smap->cache_idx], - sdata); - raw_spin_unlock_irqrestore(&local_storage->lock, flags); - } + unsigned long flags; - return sdata; + /* spinlock is needed to avoid racing with the + * parallel delete. Otherwise, publishing an already + * deleted sdata to the cache will become a use-after-free + * problem in the next bpf_local_storage_lookup(). + */ + raw_spin_lock_irqsave(&local_storage->lock, flags); + if (selem_linked_to_storage(selem)) + rcu_assign_pointer(local_storage->cache[smap->cache_idx], SDATA(selem)); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); } static int check_flags(const struct bpf_local_storage_data *old_sdata, diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index e8e910395bf6..68240c3c6e7d 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -260,9 +260,15 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) BTF_SET_START(sleepable_lsm_hooks) BTF_ID(func, bpf_lsm_bpf) BTF_ID(func, bpf_lsm_bpf_map) -BTF_ID(func, bpf_lsm_bpf_map_alloc_security) -BTF_ID(func, bpf_lsm_bpf_map_free_security) +BTF_ID(func, bpf_lsm_bpf_map_create) +BTF_ID(func, bpf_lsm_bpf_map_free) BTF_ID(func, bpf_lsm_bpf_prog) +BTF_ID(func, bpf_lsm_bpf_prog_load) +BTF_ID(func, bpf_lsm_bpf_prog_free) +BTF_ID(func, bpf_lsm_bpf_token_create) +BTF_ID(func, bpf_lsm_bpf_token_free) +BTF_ID(func, bpf_lsm_bpf_token_cmd) +BTF_ID(func, bpf_lsm_bpf_token_capable) BTF_ID(func, bpf_lsm_bprm_check_security) BTF_ID(func, bpf_lsm_bprm_committed_creds) BTF_ID(func, bpf_lsm_bprm_committing_creds) @@ -276,10 +282,6 @@ BTF_ID(func, bpf_lsm_file_lock) BTF_ID(func, bpf_lsm_file_open) BTF_ID(func, bpf_lsm_file_receive) -#ifdef CONFIG_SECURITY_NETWORK -BTF_ID(func, bpf_lsm_inet_conn_established) -#endif /* CONFIG_SECURITY_NETWORK */ - BTF_ID(func, bpf_lsm_inode_create) BTF_ID(func, bpf_lsm_inode_free_security) BTF_ID(func, bpf_lsm_inode_getattr) @@ -330,6 +332,8 @@ BTF_ID(func, bpf_lsm_sb_umount) BTF_ID(func, bpf_lsm_settime) #ifdef CONFIG_SECURITY_NETWORK +BTF_ID(func, bpf_lsm_inet_conn_established) + BTF_ID(func, bpf_lsm_socket_accept) BTF_ID(func, bpf_lsm_socket_bind) BTF_ID(func, bpf_lsm_socket_connect) @@ -357,9 +361,8 @@ BTF_ID(func, bpf_lsm_userns_create) BTF_SET_END(sleepable_lsm_hooks) BTF_SET_START(untrusted_lsm_hooks) -BTF_ID(func, bpf_lsm_bpf_map_free_security) -BTF_ID(func, bpf_lsm_bpf_prog_alloc_security) -BTF_ID(func, bpf_lsm_bpf_prog_free_security) +BTF_ID(func, bpf_lsm_bpf_map_free) +BTF_ID(func, bpf_lsm_bpf_prog_free) BTF_ID(func, bpf_lsm_file_alloc_security) BTF_ID(func, bpf_lsm_file_free_security) #ifdef CONFIG_SECURITY_NETWORK diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 02068bd0e4d9..43356faaa057 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -13,26 +13,17 @@ #include <linux/btf_ids.h> #include <linux/rcupdate_wait.h> -enum bpf_struct_ops_state { - BPF_STRUCT_OPS_STATE_INIT, - BPF_STRUCT_OPS_STATE_INUSE, - BPF_STRUCT_OPS_STATE_TOBEFREE, - BPF_STRUCT_OPS_STATE_READY, -}; - -#define BPF_STRUCT_OPS_COMMON_VALUE \ - refcount_t refcnt; \ - enum bpf_struct_ops_state state - struct bpf_struct_ops_value { - BPF_STRUCT_OPS_COMMON_VALUE; + struct bpf_struct_ops_common_value common; char data[] ____cacheline_aligned_in_smp; }; +#define MAX_TRAMP_IMAGE_PAGES 8 + struct bpf_struct_ops_map { struct bpf_map map; struct rcu_head rcu; - const struct bpf_struct_ops *st_ops; + const struct bpf_struct_ops_desc *st_ops_desc; /* protect map_update */ struct mutex lock; /* link has all the bpf_links that is populated @@ -40,12 +31,14 @@ struct bpf_struct_ops_map { * (in kvalue.data). */ struct bpf_link **links; - /* image is a page that has all the trampolines + u32 links_cnt; + u32 image_pages_cnt; + /* image_pages is an array of pages that has all the trampolines * that stores the func args before calling the bpf_prog. - * A PAGE_SIZE "image" is enough to store all trampoline for - * "links[]". */ - void *image; + void *image_pages[MAX_TRAMP_IMAGE_PAGES]; + /* The owner moduler's btf. */ + struct btf *btf; /* uvalue->data stores the kernel struct * (e.g. tcp_congestion_ops) that is more useful * to userspace than the kvalue. For example, @@ -70,35 +63,6 @@ static DEFINE_MUTEX(update_mutex); #define VALUE_PREFIX "bpf_struct_ops_" #define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1) -/* bpf_struct_ops_##_name (e.g. bpf_struct_ops_tcp_congestion_ops) is - * the map's value exposed to the userspace and its btf-type-id is - * stored at the map->btf_vmlinux_value_type_id. - * - */ -#define BPF_STRUCT_OPS_TYPE(_name) \ -extern struct bpf_struct_ops bpf_##_name; \ - \ -struct bpf_struct_ops_##_name { \ - BPF_STRUCT_OPS_COMMON_VALUE; \ - struct _name data ____cacheline_aligned_in_smp; \ -}; -#include "bpf_struct_ops_types.h" -#undef BPF_STRUCT_OPS_TYPE - -enum { -#define BPF_STRUCT_OPS_TYPE(_name) BPF_STRUCT_OPS_TYPE_##_name, -#include "bpf_struct_ops_types.h" -#undef BPF_STRUCT_OPS_TYPE - __NR_BPF_STRUCT_OPS_TYPE, -}; - -static struct bpf_struct_ops * const bpf_struct_ops[] = { -#define BPF_STRUCT_OPS_TYPE(_name) \ - [BPF_STRUCT_OPS_TYPE_##_name] = &bpf_##_name, -#include "bpf_struct_ops_types.h" -#undef BPF_STRUCT_OPS_TYPE -}; - const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = { }; @@ -108,138 +72,355 @@ const struct bpf_prog_ops bpf_struct_ops_prog_ops = { #endif }; -static const struct btf_type *module_type; +BTF_ID_LIST(st_ops_ids) +BTF_ID(struct, module) +BTF_ID(struct, bpf_struct_ops_common_value) + +enum { + IDX_MODULE_ID, + IDX_ST_OPS_COMMON_VALUE_ID, +}; + +extern struct btf *btf_vmlinux; -void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log) +static bool is_valid_value_type(struct btf *btf, s32 value_id, + const struct btf_type *type, + const char *value_name) { - s32 type_id, value_id, module_id; + const struct btf_type *common_value_type; const struct btf_member *member; - struct bpf_struct_ops *st_ops; - const struct btf_type *t; - char value_name[128]; - const char *mname; - u32 i, j; + const struct btf_type *vt, *mt; - /* Ensure BTF type is emitted for "struct bpf_struct_ops_##_name" */ -#define BPF_STRUCT_OPS_TYPE(_name) BTF_TYPE_EMIT(struct bpf_struct_ops_##_name); -#include "bpf_struct_ops_types.h" -#undef BPF_STRUCT_OPS_TYPE + vt = btf_type_by_id(btf, value_id); + if (btf_vlen(vt) != 2) { + pr_warn("The number of %s's members should be 2, but we get %d\n", + value_name, btf_vlen(vt)); + return false; + } + member = btf_type_member(vt); + mt = btf_type_by_id(btf, member->type); + common_value_type = btf_type_by_id(btf_vmlinux, + st_ops_ids[IDX_ST_OPS_COMMON_VALUE_ID]); + if (mt != common_value_type) { + pr_warn("The first member of %s should be bpf_struct_ops_common_value\n", + value_name); + return false; + } + member++; + mt = btf_type_by_id(btf, member->type); + if (mt != type) { + pr_warn("The second member of %s should be %s\n", + value_name, btf_name_by_offset(btf, type->name_off)); + return false; + } - module_id = btf_find_by_name_kind(btf, "module", BTF_KIND_STRUCT); - if (module_id < 0) { - pr_warn("Cannot find struct module in btf_vmlinux\n"); - return; + return true; +} + +static void *bpf_struct_ops_image_alloc(void) +{ + void *image; + int err; + + err = bpf_jit_charge_modmem(PAGE_SIZE); + if (err) + return ERR_PTR(err); + image = arch_alloc_bpf_trampoline(PAGE_SIZE); + if (!image) { + bpf_jit_uncharge_modmem(PAGE_SIZE); + return ERR_PTR(-ENOMEM); } - module_type = btf_type_by_id(btf, module_id); - for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) { - st_ops = bpf_struct_ops[i]; + return image; +} - if (strlen(st_ops->name) + VALUE_PREFIX_LEN >= - sizeof(value_name)) { - pr_warn("struct_ops name %s is too long\n", - st_ops->name); - continue; - } - sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name); +void bpf_struct_ops_image_free(void *image) +{ + if (image) { + arch_free_bpf_trampoline(image, PAGE_SIZE); + bpf_jit_uncharge_modmem(PAGE_SIZE); + } +} + +#define MAYBE_NULL_SUFFIX "__nullable" +#define MAX_STUB_NAME 128 - value_id = btf_find_by_name_kind(btf, value_name, - BTF_KIND_STRUCT); - if (value_id < 0) { - pr_warn("Cannot find struct %s in btf_vmlinux\n", - value_name); +/* Return the type info of a stub function, if it exists. + * + * The name of a stub function is made up of the name of the struct_ops and + * the name of the function pointer member, separated by "__". For example, + * if the struct_ops type is named "foo_ops" and the function pointer + * member is named "bar", the stub function name would be "foo_ops__bar". + */ +static const struct btf_type * +find_stub_func_proto(const struct btf *btf, const char *st_op_name, + const char *member_name) +{ + char stub_func_name[MAX_STUB_NAME]; + const struct btf_type *func_type; + s32 btf_id; + int cp; + + cp = snprintf(stub_func_name, MAX_STUB_NAME, "%s__%s", + st_op_name, member_name); + if (cp >= MAX_STUB_NAME) { + pr_warn("Stub function name too long\n"); + return NULL; + } + btf_id = btf_find_by_name_kind(btf, stub_func_name, BTF_KIND_FUNC); + if (btf_id < 0) + return NULL; + func_type = btf_type_by_id(btf, btf_id); + if (!func_type) + return NULL; + + return btf_type_by_id(btf, func_type->type); /* FUNC_PROTO */ +} + +/* Prepare argument info for every nullable argument of a member of a + * struct_ops type. + * + * Initialize a struct bpf_struct_ops_arg_info according to type info of + * the arguments of a stub function. (Check kCFI for more information about + * stub functions.) + * + * Each member in the struct_ops type has a struct bpf_struct_ops_arg_info + * to provide an array of struct bpf_ctx_arg_aux, which in turn provides + * the information that used by the verifier to check the arguments of the + * BPF struct_ops program assigned to the member. Here, we only care about + * the arguments that are marked as __nullable. + * + * The array of struct bpf_ctx_arg_aux is eventually assigned to + * prog->aux->ctx_arg_info of BPF struct_ops programs and passed to the + * verifier. (See check_struct_ops_btf_id()) + * + * arg_info->info will be the list of struct bpf_ctx_arg_aux if success. If + * fails, it will be kept untouched. + */ +static int prepare_arg_info(struct btf *btf, + const char *st_ops_name, + const char *member_name, + const struct btf_type *func_proto, + struct bpf_struct_ops_arg_info *arg_info) +{ + const struct btf_type *stub_func_proto, *pointed_type; + const struct btf_param *stub_args, *args; + struct bpf_ctx_arg_aux *info, *info_buf; + u32 nargs, arg_no, info_cnt = 0; + u32 arg_btf_id; + int offset; + + stub_func_proto = find_stub_func_proto(btf, st_ops_name, member_name); + if (!stub_func_proto) + return 0; + + /* Check if the number of arguments of the stub function is the same + * as the number of arguments of the function pointer. + */ + nargs = btf_type_vlen(func_proto); + if (nargs != btf_type_vlen(stub_func_proto)) { + pr_warn("the number of arguments of the stub function %s__%s does not match the number of arguments of the member %s of struct %s\n", + st_ops_name, member_name, member_name, st_ops_name); + return -EINVAL; + } + + if (!nargs) + return 0; + + args = btf_params(func_proto); + stub_args = btf_params(stub_func_proto); + + info_buf = kcalloc(nargs, sizeof(*info_buf), GFP_KERNEL); + if (!info_buf) + return -ENOMEM; + + /* Prepare info for every nullable argument */ + info = info_buf; + for (arg_no = 0; arg_no < nargs; arg_no++) { + /* Skip arguments that is not suffixed with + * "__nullable". + */ + if (!btf_param_match_suffix(btf, &stub_args[arg_no], + MAYBE_NULL_SUFFIX)) continue; + + /* Should be a pointer to struct */ + pointed_type = btf_type_resolve_ptr(btf, + args[arg_no].type, + &arg_btf_id); + if (!pointed_type || + !btf_type_is_struct(pointed_type)) { + pr_warn("stub function %s__%s has %s tagging to an unsupported type\n", + st_ops_name, member_name, MAYBE_NULL_SUFFIX); + goto err_out; } - type_id = btf_find_by_name_kind(btf, st_ops->name, - BTF_KIND_STRUCT); - if (type_id < 0) { - pr_warn("Cannot find struct %s in btf_vmlinux\n", - st_ops->name); - continue; + offset = btf_ctx_arg_offset(btf, func_proto, arg_no); + if (offset < 0) { + pr_warn("stub function %s__%s has an invalid trampoline ctx offset for arg#%u\n", + st_ops_name, member_name, arg_no); + goto err_out; } - t = btf_type_by_id(btf, type_id); - if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) { - pr_warn("Cannot support #%u members in struct %s\n", - btf_type_vlen(t), st_ops->name); - continue; + + if (args[arg_no].type != stub_args[arg_no].type) { + pr_warn("arg#%u type in stub function %s__%s does not match with its original func_proto\n", + arg_no, st_ops_name, member_name); + goto err_out; } - for_each_member(j, t, member) { - const struct btf_type *func_proto; + /* Fill the information of the new argument */ + info->reg_type = + PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL; + info->btf_id = arg_btf_id; + info->btf = btf; + info->offset = offset; - mname = btf_name_by_offset(btf, member->name_off); - if (!*mname) { - pr_warn("anon member in struct %s is not supported\n", - st_ops->name); - break; - } + info++; + info_cnt++; + } - if (__btf_member_bitfield_size(t, member)) { - pr_warn("bit field member %s in struct %s is not supported\n", - mname, st_ops->name); - break; - } + if (info_cnt) { + arg_info->info = info_buf; + arg_info->cnt = info_cnt; + } else { + kfree(info_buf); + } - func_proto = btf_type_resolve_func_ptr(btf, - member->type, - NULL); - if (func_proto && - btf_distill_func_proto(log, btf, - func_proto, mname, - &st_ops->func_models[j])) { - pr_warn("Error in parsing func ptr %s in struct %s\n", - mname, st_ops->name); - break; - } - } + return 0; - if (j == btf_type_vlen(t)) { - if (st_ops->init(btf)) { - pr_warn("Error in init bpf_struct_ops %s\n", - st_ops->name); - } else { - st_ops->type_id = type_id; - st_ops->type = t; - st_ops->value_id = value_id; - st_ops->value_type = btf_type_by_id(btf, - value_id); - } - } - } +err_out: + kfree(info_buf); + + return -EINVAL; } -extern struct btf *btf_vmlinux; +/* Clean up the arg_info in a struct bpf_struct_ops_desc. */ +void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc) +{ + struct bpf_struct_ops_arg_info *arg_info; + int i; -static const struct bpf_struct_ops * -bpf_struct_ops_find_value(u32 value_id) + arg_info = st_ops_desc->arg_info; + for (i = 0; i < btf_type_vlen(st_ops_desc->type); i++) + kfree(arg_info[i].info); + + kfree(arg_info); +} + +int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, + struct btf *btf, + struct bpf_verifier_log *log) { - unsigned int i; + struct bpf_struct_ops *st_ops = st_ops_desc->st_ops; + struct bpf_struct_ops_arg_info *arg_info; + const struct btf_member *member; + const struct btf_type *t; + s32 type_id, value_id; + char value_name[128]; + const char *mname; + int i, err; - if (!value_id || !btf_vmlinux) - return NULL; + if (strlen(st_ops->name) + VALUE_PREFIX_LEN >= + sizeof(value_name)) { + pr_warn("struct_ops name %s is too long\n", + st_ops->name); + return -EINVAL; + } + sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name); - for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) { - if (bpf_struct_ops[i]->value_id == value_id) - return bpf_struct_ops[i]; + if (!st_ops->cfi_stubs) { + pr_warn("struct_ops for %s has no cfi_stubs\n", st_ops->name); + return -EINVAL; } - return NULL; -} + type_id = btf_find_by_name_kind(btf, st_ops->name, + BTF_KIND_STRUCT); + if (type_id < 0) { + pr_warn("Cannot find struct %s in %s\n", + st_ops->name, btf_get_name(btf)); + return -EINVAL; + } + t = btf_type_by_id(btf, type_id); + if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) { + pr_warn("Cannot support #%u members in struct %s\n", + btf_type_vlen(t), st_ops->name); + return -EINVAL; + } -const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id) -{ - unsigned int i; + value_id = btf_find_by_name_kind(btf, value_name, + BTF_KIND_STRUCT); + if (value_id < 0) { + pr_warn("Cannot find struct %s in %s\n", + value_name, btf_get_name(btf)); + return -EINVAL; + } + if (!is_valid_value_type(btf, value_id, t, value_name)) + return -EINVAL; - if (!type_id || !btf_vmlinux) - return NULL; + arg_info = kcalloc(btf_type_vlen(t), sizeof(*arg_info), + GFP_KERNEL); + if (!arg_info) + return -ENOMEM; + + st_ops_desc->arg_info = arg_info; + st_ops_desc->type = t; + st_ops_desc->type_id = type_id; + st_ops_desc->value_id = value_id; + st_ops_desc->value_type = btf_type_by_id(btf, value_id); + + for_each_member(i, t, member) { + const struct btf_type *func_proto; - for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) { - if (bpf_struct_ops[i]->type_id == type_id) - return bpf_struct_ops[i]; + mname = btf_name_by_offset(btf, member->name_off); + if (!*mname) { + pr_warn("anon member in struct %s is not supported\n", + st_ops->name); + err = -EOPNOTSUPP; + goto errout; + } + + if (__btf_member_bitfield_size(t, member)) { + pr_warn("bit field member %s in struct %s is not supported\n", + mname, st_ops->name); + err = -EOPNOTSUPP; + goto errout; + } + + func_proto = btf_type_resolve_func_ptr(btf, + member->type, + NULL); + if (!func_proto) + continue; + + if (btf_distill_func_proto(log, btf, + func_proto, mname, + &st_ops->func_models[i])) { + pr_warn("Error in parsing func ptr %s in struct %s\n", + mname, st_ops->name); + err = -EINVAL; + goto errout; + } + + err = prepare_arg_info(btf, st_ops->name, mname, + func_proto, + arg_info + i); + if (err) + goto errout; } - return NULL; + if (st_ops->init(btf)) { + pr_warn("Error in init bpf_struct_ops %s\n", + st_ops->name); + err = -EINVAL; + goto errout; + } + + return 0; + +errout: + bpf_struct_ops_desc_release(st_ops_desc); + + return err; } static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key, @@ -265,7 +446,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, kvalue = &st_map->kvalue; /* Pair with smp_store_release() during map_update */ - state = smp_load_acquire(&kvalue->state); + state = smp_load_acquire(&kvalue->common.state); if (state == BPF_STRUCT_OPS_STATE_INIT) { memset(value, 0, map->value_size); return 0; @@ -276,7 +457,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, */ uvalue = value; memcpy(uvalue, st_map->uvalue, map->value_size); - uvalue->state = state; + uvalue->common.state = state; /* This value offers the user space a general estimate of how * many sockets are still utilizing this struct_ops for TCP @@ -284,7 +465,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, * should sufficiently meet our present goals. */ refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt); - refcount_set(&uvalue->refcnt, max_t(s64, refcnt, 0)); + refcount_set(&uvalue->common.refcnt, max_t(s64, refcnt, 0)); return 0; } @@ -296,10 +477,9 @@ static void *bpf_struct_ops_map_lookup_elem(struct bpf_map *map, void *key) static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map) { - const struct btf_type *t = st_map->st_ops->type; u32 i; - for (i = 0; i < btf_type_vlen(t); i++) { + for (i = 0; i < st_map->links_cnt; i++) { if (st_map->links[i]) { bpf_link_put(st_map->links[i]); st_map->links[i] = NULL; @@ -307,7 +487,16 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map) } } -static int check_zero_holes(const struct btf_type *t, void *data) +static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map *st_map) +{ + int i; + + for (i = 0; i < st_map->image_pages_cnt; i++) + bpf_struct_ops_image_free(st_map->image_pages[i]); + st_map->image_pages_cnt = 0; +} + +static int check_zero_holes(const struct btf *btf, const struct btf_type *t, void *data) { const struct btf_member *member; u32 i, moff, msize, prev_mend = 0; @@ -319,8 +508,8 @@ static int check_zero_holes(const struct btf_type *t, void *data) memchr_inv(data + prev_mend, 0, moff - prev_mend)) return -EINVAL; - mtype = btf_type_by_id(btf_vmlinux, member->type); - mtype = btf_resolve_size(btf_vmlinux, mtype, &msize); + mtype = btf_type_by_id(btf, member->type); + mtype = btf_resolve_size(btf, mtype, &msize); if (IS_ERR(mtype)) return PTR_ERR(mtype); prev_mend = moff + msize; @@ -352,9 +541,12 @@ const struct bpf_link_ops bpf_struct_ops_link_lops = { int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, struct bpf_tramp_link *link, const struct btf_func_model *model, - void *stub_func, void *image, void *image_end) + void *stub_func, + void **_image, u32 *_image_off, + bool allow_alloc) { - u32 flags = BPF_TRAMP_F_INDIRECT; + u32 image_off = *_image_off, flags = BPF_TRAMP_F_INDIRECT; + void *image = *_image; int size; tlinks[BPF_TRAMP_FENTRY].links[0] = link; @@ -364,27 +556,49 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, flags |= BPF_TRAMP_F_RET_FENTRY_RET; size = arch_bpf_trampoline_size(model, flags, tlinks, NULL); - if (size < 0) - return size; - if (size > (unsigned long)image_end - (unsigned long)image) - return -E2BIG; - return arch_prepare_bpf_trampoline(NULL, image, image_end, + if (size <= 0) + return size ? : -EFAULT; + + /* Allocate image buffer if necessary */ + if (!image || size > PAGE_SIZE - image_off) { + if (!allow_alloc) + return -E2BIG; + + image = bpf_struct_ops_image_alloc(); + if (IS_ERR(image)) + return PTR_ERR(image); + image_off = 0; + } + + size = arch_prepare_bpf_trampoline(NULL, image + image_off, + image + PAGE_SIZE, model, flags, tlinks, stub_func); + if (size <= 0) { + if (image != *_image) + bpf_struct_ops_image_free(image); + return size ? : -EFAULT; + } + + *_image = image; + *_image_off = image_off + size; + return 0; } static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; - const struct bpf_struct_ops *st_ops = st_map->st_ops; + const struct bpf_struct_ops_desc *st_ops_desc = st_map->st_ops_desc; + const struct bpf_struct_ops *st_ops = st_ops_desc->st_ops; struct bpf_struct_ops_value *uvalue, *kvalue; + const struct btf_type *module_type; const struct btf_member *member; - const struct btf_type *t = st_ops->type; + const struct btf_type *t = st_ops_desc->type; struct bpf_tramp_links *tlinks; void *udata, *kdata; int prog_fd, err; - void *image, *image_end; - u32 i; + u32 i, trampoline_start, image_off = 0; + void *cur_image = NULL, *image = NULL; if (flags) return -EINVAL; @@ -392,16 +606,16 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, if (*(u32 *)key != 0) return -E2BIG; - err = check_zero_holes(st_ops->value_type, value); + err = check_zero_holes(st_map->btf, st_ops_desc->value_type, value); if (err) return err; uvalue = value; - err = check_zero_holes(t, uvalue->data); + err = check_zero_holes(st_map->btf, t, uvalue->data); if (err) return err; - if (uvalue->state || refcount_read(&uvalue->refcnt)) + if (uvalue->common.state || refcount_read(&uvalue->common.refcnt)) return -EINVAL; tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL); @@ -413,7 +627,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, mutex_lock(&st_map->lock); - if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) { + if (kvalue->common.state != BPF_STRUCT_OPS_STATE_INIT) { err = -EBUSY; goto unlock; } @@ -422,9 +636,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, udata = &uvalue->data; kdata = &kvalue->data; - image = st_map->image; - image_end = st_map->image + PAGE_SIZE; + module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]); for_each_member(i, t, member) { const struct btf_type *mtype, *ptype; struct bpf_prog *prog; @@ -432,7 +645,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, u32 moff; moff = __btf_member_bit_offset(t, member) / 8; - ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL); + ptype = btf_type_resolve_ptr(st_map->btf, member->type, NULL); if (ptype == module_type) { if (*(void **)(udata + moff)) goto reset_unlock; @@ -457,8 +670,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, if (!ptype || !btf_type_is_func_proto(ptype)) { u32 msize; - mtype = btf_type_by_id(btf_vmlinux, member->type); - mtype = btf_resolve_size(btf_vmlinux, mtype, &msize); + mtype = btf_type_by_id(st_map->btf, member->type); + mtype = btf_resolve_size(st_map->btf, mtype, &msize); if (IS_ERR(mtype)) { err = PTR_ERR(mtype); goto reset_unlock; @@ -484,7 +697,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, } if (prog->type != BPF_PROG_TYPE_STRUCT_OPS || - prog->aux->attach_btf_id != st_ops->type_id || + prog->aux->attach_btf_id != st_ops_desc->type_id || prog->expected_attach_type != i) { bpf_prog_put(prog); err = -EINVAL; @@ -501,37 +714,47 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, &bpf_struct_ops_link_lops, prog); st_map->links[i] = &link->link; + trampoline_start = image_off; err = bpf_struct_ops_prepare_trampoline(tlinks, link, - &st_ops->func_models[i], - *(void **)(st_ops->cfi_stubs + moff), - image, image_end); + &st_ops->func_models[i], + *(void **)(st_ops->cfi_stubs + moff), + &image, &image_off, + st_map->image_pages_cnt < MAX_TRAMP_IMAGE_PAGES); + if (err) + goto reset_unlock; + + if (cur_image != image) { + st_map->image_pages[st_map->image_pages_cnt++] = image; + cur_image = image; + trampoline_start = 0; + } if (err < 0) goto reset_unlock; - *(void **)(kdata + moff) = image + cfi_get_offset(); - image += err; + *(void **)(kdata + moff) = image + trampoline_start + cfi_get_offset(); /* put prog_id to udata */ *(unsigned long *)(udata + moff) = prog->aux->id; } + if (st_ops->validate) { + err = st_ops->validate(kdata); + if (err) + goto reset_unlock; + } + for (i = 0; i < st_map->image_pages_cnt; i++) + arch_protect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE); + if (st_map->map.map_flags & BPF_F_LINK) { err = 0; - if (st_ops->validate) { - err = st_ops->validate(kdata); - if (err) - goto reset_unlock; - } - arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE); /* Let bpf_link handle registration & unregistration. * * Pair with smp_load_acquire() during lookup_elem(). */ - smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY); + smp_store_release(&kvalue->common.state, BPF_STRUCT_OPS_STATE_READY); goto unlock; } - arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE); err = st_ops->reg(kdata); if (likely(!err)) { /* This refcnt increment on the map here after @@ -545,7 +768,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, * It ensures the above udata updates (e.g. prog->aux->id) * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set. */ - smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE); + smp_store_release(&kvalue->common.state, BPF_STRUCT_OPS_STATE_INUSE); goto unlock; } @@ -554,9 +777,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, * there was a race in registering the struct_ops (under the same name) to * a sub-system through different struct_ops's maps. */ - arch_unprotect_bpf_trampoline(st_map->image, PAGE_SIZE); reset_unlock: + bpf_struct_ops_map_free_image(st_map); bpf_struct_ops_map_put_progs(st_map); memset(uvalue, 0, map->value_size); memset(kvalue, 0, map->value_size); @@ -575,12 +798,12 @@ static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) if (st_map->map.map_flags & BPF_F_LINK) return -EOPNOTSUPP; - prev_state = cmpxchg(&st_map->kvalue.state, + prev_state = cmpxchg(&st_map->kvalue.common.state, BPF_STRUCT_OPS_STATE_INUSE, BPF_STRUCT_OPS_STATE_TOBEFREE); switch (prev_state) { case BPF_STRUCT_OPS_STATE_INUSE: - st_map->st_ops->unreg(&st_map->kvalue.data); + st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data); bpf_map_put(map); return 0; case BPF_STRUCT_OPS_STATE_TOBEFREE: @@ -597,6 +820,7 @@ static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { + struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; void *value; int err; @@ -606,7 +830,8 @@ static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key, err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); if (!err) { - btf_type_seq_show(btf_vmlinux, map->btf_vmlinux_value_type_id, + btf_type_seq_show(st_map->btf, + map->btf_vmlinux_value_type_id, value, m); seq_puts(m, "\n"); } @@ -621,16 +846,22 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map) if (st_map->links) bpf_struct_ops_map_put_progs(st_map); bpf_map_area_free(st_map->links); - if (st_map->image) { - arch_free_bpf_trampoline(st_map->image, PAGE_SIZE); - bpf_jit_uncharge_modmem(PAGE_SIZE); - } + bpf_struct_ops_map_free_image(st_map); bpf_map_area_free(st_map->uvalue); bpf_map_area_free(st_map); } static void bpf_struct_ops_map_free(struct bpf_map *map) { + struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; + + /* st_ops->owner was acquired during map_alloc to implicitly holds + * the btf's refcnt. The acquire was only done when btf_is_module() + * st_map->btf cannot be NULL here. + */ + if (btf_is_module(st_map->btf)) + module_put(st_map->st_ops_desc->st_ops->owner); + /* The struct_ops's function may switch to another struct_ops. * * For example, bpf_tcp_cc_x->init() may switch to @@ -654,29 +885,61 @@ static void bpf_struct_ops_map_free(struct bpf_map *map) static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr) { if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 || - (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id) + (attr->map_flags & ~(BPF_F_LINK | BPF_F_VTYPE_BTF_OBJ_FD)) || + !attr->btf_vmlinux_value_type_id) return -EINVAL; return 0; } static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) { - const struct bpf_struct_ops *st_ops; + const struct bpf_struct_ops_desc *st_ops_desc; size_t st_map_size; struct bpf_struct_ops_map *st_map; const struct btf_type *t, *vt; + struct module *mod = NULL; struct bpf_map *map; + struct btf *btf; int ret; - st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id); - if (!st_ops) - return ERR_PTR(-ENOTSUPP); + if (attr->map_flags & BPF_F_VTYPE_BTF_OBJ_FD) { + /* The map holds btf for its whole life time. */ + btf = btf_get_by_fd(attr->value_type_btf_obj_fd); + if (IS_ERR(btf)) + return ERR_CAST(btf); + if (!btf_is_module(btf)) { + btf_put(btf); + return ERR_PTR(-EINVAL); + } + + mod = btf_try_get_module(btf); + /* mod holds a refcnt to btf. We don't need an extra refcnt + * here. + */ + btf_put(btf); + if (!mod) + return ERR_PTR(-EINVAL); + } else { + btf = bpf_get_btf_vmlinux(); + if (IS_ERR(btf)) + return ERR_CAST(btf); + if (!btf) + return ERR_PTR(-ENOTSUPP); + } + + st_ops_desc = bpf_struct_ops_find_value(btf, attr->btf_vmlinux_value_type_id); + if (!st_ops_desc) { + ret = -ENOTSUPP; + goto errout; + } - vt = st_ops->value_type; - if (attr->value_size != vt->size) - return ERR_PTR(-EINVAL); + vt = st_ops_desc->value_type; + if (attr->value_size != vt->size) { + ret = -EINVAL; + goto errout; + } - t = st_ops->type; + t = st_ops_desc->type; st_map_size = sizeof(*st_map) + /* kvalue stores the @@ -685,48 +948,43 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) (vt->size - sizeof(struct bpf_struct_ops_value)); st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE); - if (!st_map) - return ERR_PTR(-ENOMEM); + if (!st_map) { + ret = -ENOMEM; + goto errout; + } - st_map->st_ops = st_ops; + st_map->st_ops_desc = st_ops_desc; map = &st_map->map; - ret = bpf_jit_charge_modmem(PAGE_SIZE); - if (ret) { - __bpf_struct_ops_map_free(map); - return ERR_PTR(ret); - } - - st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE); - if (!st_map->image) { - /* __bpf_struct_ops_map_free() uses st_map->image as flag - * for "charged or not". In this case, we need to unchange - * here. - */ - bpf_jit_uncharge_modmem(PAGE_SIZE); - __bpf_struct_ops_map_free(map); - return ERR_PTR(-ENOMEM); - } st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE); + st_map->links_cnt = btf_type_vlen(t); st_map->links = - bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_links *), + bpf_map_area_alloc(st_map->links_cnt * sizeof(struct bpf_links *), NUMA_NO_NODE); if (!st_map->uvalue || !st_map->links) { - __bpf_struct_ops_map_free(map); - return ERR_PTR(-ENOMEM); + ret = -ENOMEM; + goto errout_free; } + st_map->btf = btf; mutex_init(&st_map->lock); bpf_map_init_from_attr(map, attr); return map; + +errout_free: + __bpf_struct_ops_map_free(map); +errout: + module_put(mod); + + return ERR_PTR(ret); } static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; - const struct bpf_struct_ops *st_ops = st_map->st_ops; - const struct btf_type *vt = st_ops->value_type; + const struct bpf_struct_ops_desc *st_ops_desc = st_map->st_ops_desc; + const struct btf_type *vt = st_ops_desc->value_type; u64 usage; usage = sizeof(*st_map) + @@ -785,7 +1043,7 @@ static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map) return map->map_type == BPF_MAP_TYPE_STRUCT_OPS && map->map_flags & BPF_F_LINK && /* Pair with smp_store_release() during map_update */ - smp_load_acquire(&st_map->kvalue.state) == BPF_STRUCT_OPS_STATE_READY; + smp_load_acquire(&st_map->kvalue.common.state) == BPF_STRUCT_OPS_STATE_READY; } static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link) @@ -800,7 +1058,7 @@ static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link) /* st_link->map can be NULL if * bpf_struct_ops_link_create() fails to register. */ - st_map->st_ops->unreg(&st_map->kvalue.data); + st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data); bpf_map_put(&st_map->map); } kfree(st_link); @@ -847,7 +1105,7 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map if (!bpf_struct_ops_valid_to_reg(new_map)) return -EINVAL; - if (!st_map->st_ops->update) + if (!st_map->st_ops_desc->st_ops->update) return -EOPNOTSUPP; mutex_lock(&update_mutex); @@ -860,12 +1118,12 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map old_st_map = container_of(old_map, struct bpf_struct_ops_map, map); /* The new and old struct_ops must be the same type. */ - if (st_map->st_ops != old_st_map->st_ops) { + if (st_map->st_ops_desc != old_st_map->st_ops_desc) { err = -EINVAL; goto err_out; } - err = st_map->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data); + err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data); if (err) goto err_out; @@ -916,7 +1174,7 @@ int bpf_struct_ops_link_create(union bpf_attr *attr) if (err) goto err_out; - err = st_map->st_ops->reg(st_map->kvalue.data); + err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data); if (err) { bpf_link_cleanup(&link_primer); link = NULL; @@ -931,3 +1189,10 @@ err_out: kfree(link); return err; } + +void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map) +{ + struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; + + info->btf_vmlinux_id = btf_obj_id(st_map->btf); +} diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h deleted file mode 100644 index 5678a9ddf817..000000000000 --- a/kernel/bpf/bpf_struct_ops_types.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* internal file - do not include directly */ - -#ifdef CONFIG_BPF_JIT -#ifdef CONFIG_NET -BPF_STRUCT_OPS_TYPE(bpf_dummy_ops) -#endif -#ifdef CONFIG_INET -#include <net/tcp.h> -BPF_STRUCT_OPS_TYPE(tcp_congestion_ops) -#endif -#endif diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 596471189176..90c4a32d89ff 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -19,6 +19,7 @@ #include <linux/bpf_verifier.h> #include <linux/btf.h> #include <linux/btf_ids.h> +#include <linux/bpf.h> #include <linux/bpf_lsm.h> #include <linux/skmsg.h> #include <linux/perf_event.h> @@ -241,6 +242,12 @@ struct btf_id_dtor_kfunc_tab { struct btf_id_dtor_kfunc dtors[]; }; +struct btf_struct_ops_tab { + u32 cnt; + u32 capacity; + struct bpf_struct_ops_desc ops[]; +}; + struct btf { void *data; struct btf_type **types; @@ -258,6 +265,7 @@ struct btf { struct btf_kfunc_set_tab *kfunc_set_tab; struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab; struct btf_struct_metas *struct_meta_tab; + struct btf_struct_ops_tab *struct_ops_tab; /* split BTF support */ struct btf *base_btf; @@ -801,9 +809,23 @@ static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) return __btf_name_valid(btf, offset); } +/* Allow any printable character in DATASEC names */ static bool btf_name_valid_section(const struct btf *btf, u32 offset) { - return __btf_name_valid(btf, offset); + /* offset must be valid */ + const char *src = btf_str_by_offset(btf, offset); + const char *src_limit; + + /* set a limit on identifier length */ + src_limit = src + KSYM_NAME_LEN; + src++; + while (*src && src < src_limit) { + if (!isprint(*src)) + return false; + src++; + } + + return !*src; } static const char *__btf_name_by_offset(const struct btf *btf, u32 offset) @@ -1688,11 +1710,27 @@ static void btf_free_struct_meta_tab(struct btf *btf) btf->struct_meta_tab = NULL; } +static void btf_free_struct_ops_tab(struct btf *btf) +{ + struct btf_struct_ops_tab *tab = btf->struct_ops_tab; + u32 i; + + if (!tab) + return; + + for (i = 0; i < tab->cnt; i++) + bpf_struct_ops_desc_release(&tab->ops[i]); + + kfree(tab); + btf->struct_ops_tab = NULL; +} + static void btf_free(struct btf *btf) { btf_free_struct_meta_tab(btf); btf_free_dtor_kfunc_tab(btf); btf_free_kfunc_set_tab(btf); + btf_free_struct_ops_tab(btf); kvfree(btf->types); kvfree(btf->resolved_sizes); kvfree(btf->resolved_ids); @@ -1707,6 +1745,11 @@ static void btf_free_rcu(struct rcu_head *rcu) btf_free(btf); } +const char *btf_get_name(const struct btf *btf) +{ + return btf->name; +} + void btf_get(struct btf *btf) { refcount_inc(&btf->refcnt); @@ -3310,30 +3353,48 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, return BTF_FIELD_FOUND; } -const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt, - int comp_idx, const char *tag_key) +int btf_find_next_decl_tag(const struct btf *btf, const struct btf_type *pt, + int comp_idx, const char *tag_key, int last_id) { - const char *value = NULL; - int i; + int len = strlen(tag_key); + int i, n; - for (i = 1; i < btf_nr_types(btf); i++) { + for (i = last_id + 1, n = btf_nr_types(btf); i < n; i++) { const struct btf_type *t = btf_type_by_id(btf, i); - int len = strlen(tag_key); if (!btf_type_is_decl_tag(t)) continue; - if (pt != btf_type_by_id(btf, t->type) || - btf_type_decl_tag(t)->component_idx != comp_idx) + if (pt != btf_type_by_id(btf, t->type)) + continue; + if (btf_type_decl_tag(t)->component_idx != comp_idx) continue; if (strncmp(__btf_name_by_offset(btf, t->name_off), tag_key, len)) continue; - /* Prevent duplicate entries for same type */ - if (value) - return ERR_PTR(-EEXIST); - value = __btf_name_by_offset(btf, t->name_off) + len; + return i; } - if (!value) - return ERR_PTR(-ENOENT); + return -ENOENT; +} + +const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt, + int comp_idx, const char *tag_key) +{ + const char *value = NULL; + const struct btf_type *t; + int len, id; + + id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key, 0); + if (id < 0) + return ERR_PTR(id); + + t = btf_type_by_id(btf, id); + len = strlen(tag_key); + value = __btf_name_by_offset(btf, t->name_off) + len; + + /* Prevent duplicate entries for same type */ + id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key, id); + if (id >= 0) + return ERR_PTR(-EEXIST); + return value; } @@ -5647,15 +5708,29 @@ static int find_kern_ctx_type_id(enum bpf_prog_type prog_type) return ctx_type->type; } -const struct btf_type * -btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, enum bpf_prog_type prog_type, - int arg) +bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, enum bpf_prog_type prog_type, + int arg) { const struct btf_type *ctx_type; const char *tname, *ctx_tname; t = btf_type_by_id(btf, t->type); + + /* KPROBE programs allow bpf_user_pt_regs_t typedef, which we need to + * check before we skip all the typedef below. + */ + if (prog_type == BPF_PROG_TYPE_KPROBE) { + while (btf_type_is_modifier(t) && !btf_type_is_typedef(t)) + t = btf_type_by_id(btf, t->type); + + if (btf_type_is_typedef(t)) { + tname = btf_name_by_offset(btf, t->name_off); + if (tname && strcmp(tname, "bpf_user_pt_regs_t") == 0) + return true; + } + } + while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (!btf_type_is_struct(t)) { @@ -5664,27 +5739,30 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, * is not supported yet. * BPF_PROG_TYPE_RAW_TRACEPOINT is fine. */ - return NULL; + return false; } tname = btf_name_by_offset(btf, t->name_off); if (!tname) { bpf_log(log, "arg#%d struct doesn't have a name\n", arg); - return NULL; + return false; } ctx_type = find_canonical_prog_ctx_type(prog_type); if (!ctx_type) { bpf_log(log, "btf_vmlinux is malformed\n"); /* should not happen */ - return NULL; + return false; } again: ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off); if (!ctx_tname) { /* should not happen */ bpf_log(log, "Please fix kernel include/linux/bpf_types.h\n"); - return NULL; + return false; } + /* program types without named context types work only with arg:ctx tag */ + if (ctx_tname[0] == '\0') + return false; /* only compare that prog's ctx type name is the same as * kernel expects. No need to compare field by field. * It's ok for bpf prog to do: @@ -5693,20 +5771,20 @@ again: * { // no fields of skb are ever used } */ if (strcmp(ctx_tname, "__sk_buff") == 0 && strcmp(tname, "sk_buff") == 0) - return ctx_type; + return true; if (strcmp(ctx_tname, "xdp_md") == 0 && strcmp(tname, "xdp_buff") == 0) - return ctx_type; + return true; if (strcmp(ctx_tname, tname)) { /* bpf_user_pt_regs_t is a typedef, so resolve it to * underlying struct and check name again */ if (!btf_type_is_modifier(ctx_type)) - return NULL; + return false; while (btf_type_is_modifier(ctx_type)) ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type); goto again; } - return ctx_type; + return true; } /* forward declarations for arch-specific underlying types of @@ -5858,7 +5936,7 @@ static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, enum bpf_prog_type prog_type, int arg) { - if (!btf_get_prog_ctx_type(log, btf, t, prog_type, arg)) + if (!btf_is_prog_ctx_type(log, btf, t, prog_type, arg)) return -ENOENT; return find_kern_ctx_type_id(prog_type); } @@ -5933,8 +6011,6 @@ struct btf *btf_parse_vmlinux(void) /* btf_parse_vmlinux() runs under bpf_verifier_lock */ bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]); - bpf_struct_ops_init(btf, log); - refcount_set(&btf->refcnt, 1); err = btf_alloc_id(btf); @@ -6092,6 +6168,26 @@ static bool prog_args_trusted(const struct bpf_prog *prog) } } +int btf_ctx_arg_offset(const struct btf *btf, const struct btf_type *func_proto, + u32 arg_no) +{ + const struct btf_param *args; + const struct btf_type *t; + int off = 0, i; + u32 sz; + + args = btf_params(func_proto); + for (i = 0; i < arg_no; i++) { + t = btf_type_by_id(btf, args[i].type); + t = btf_resolve_size(btf, t, &sz); + if (IS_ERR(t)) + return PTR_ERR(t); + off += roundup(sz, 8); + } + + return off; +} + bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) @@ -6228,7 +6324,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } info->reg_type = ctx_arg_info->reg_type; - info->btf = btf_vmlinux; + info->btf = ctx_arg_info->btf ? : btf_vmlinux; info->btf_id = ctx_arg_info->btf_id; return true; } @@ -6284,6 +6380,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, __btf_name_by_offset(btf, t->name_off)); return true; } +EXPORT_SYMBOL_GPL(btf_ctx_access); enum bpf_struct_walk_result { /* < 0 error */ @@ -6946,6 +7043,81 @@ static bool btf_is_dynptr_ptr(const struct btf *btf, const struct btf_type *t) return false; } +struct bpf_cand_cache { + const char *name; + u32 name_len; + u16 kind; + u16 cnt; + struct { + const struct btf *btf; + u32 id; + } cands[]; +}; + +static DEFINE_MUTEX(cand_cache_mutex); + +static struct bpf_cand_cache * +bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id); + +static int btf_get_ptr_to_btf_id(struct bpf_verifier_log *log, int arg_idx, + const struct btf *btf, const struct btf_type *t) +{ + struct bpf_cand_cache *cc; + struct bpf_core_ctx ctx = { + .btf = btf, + .log = log, + }; + u32 kern_type_id, type_id; + int err = 0; + + /* skip PTR and modifiers */ + type_id = t->type; + t = btf_type_by_id(btf, t->type); + while (btf_type_is_modifier(t)) { + type_id = t->type; + t = btf_type_by_id(btf, t->type); + } + + mutex_lock(&cand_cache_mutex); + cc = bpf_core_find_cands(&ctx, type_id); + if (IS_ERR(cc)) { + err = PTR_ERR(cc); + bpf_log(log, "arg#%d reference type('%s %s') candidate matching error: %d\n", + arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off), + err); + goto cand_cache_unlock; + } + if (cc->cnt != 1) { + bpf_log(log, "arg#%d reference type('%s %s') %s\n", + arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off), + cc->cnt == 0 ? "has no matches" : "is ambiguous"); + err = cc->cnt == 0 ? -ENOENT : -ESRCH; + goto cand_cache_unlock; + } + if (btf_is_module(cc->cands[0].btf)) { + bpf_log(log, "arg#%d reference type('%s %s') points to kernel module type (unsupported)\n", + arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off)); + err = -EOPNOTSUPP; + goto cand_cache_unlock; + } + kern_type_id = cc->cands[0].id; + +cand_cache_unlock: + mutex_unlock(&cand_cache_mutex); + if (err) + return err; + + return kern_type_id; +} + +enum btf_arg_tag { + ARG_TAG_CTX = BIT_ULL(0), + ARG_TAG_NONNULL = BIT_ULL(1), + ARG_TAG_TRUSTED = BIT_ULL(2), + ARG_TAG_NULLABLE = BIT_ULL(3), + ARG_TAG_ARENA = BIT_ULL(4), +}; + /* Process BTF of a function to produce high-level expectation of function * arguments (like ARG_PTR_TO_CTX, or ARG_PTR_TO_MEM, etc). This information * is cached in subprog info for reuse. @@ -7009,6 +7181,8 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) args = (const struct btf_param *)(t + 1); nargs = btf_type_vlen(t); if (nargs > MAX_BPF_FUNC_REG_ARGS) { + if (!is_global) + return -EINVAL; bpf_log(log, "Global function %s() with %d > %d args. Buggy compiler.\n", tname, nargs, MAX_BPF_FUNC_REG_ARGS); return -EINVAL; @@ -7018,6 +7192,8 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (!btf_type_is_int(t) && !btf_is_any_enum(t)) { + if (!is_global) + return -EINVAL; bpf_log(log, "Global function %s() doesn't return scalar. Only those are supported.\n", tname); @@ -7027,92 +7203,134 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) * Only PTR_TO_CTX and SCALAR are supported atm. */ for (i = 0; i < nargs; i++) { - bool is_nonnull = false; - const char *tag; + u32 tags = 0; + int id = 0; - t = btf_type_by_id(btf, args[i].type); - - tag = btf_find_decl_tag_value(btf, fn_t, i, "arg:"); - if (IS_ERR(tag) && PTR_ERR(tag) == -ENOENT) { - tag = NULL; - } else if (IS_ERR(tag)) { - bpf_log(log, "arg#%d type's tag fetching failure: %ld\n", i, PTR_ERR(tag)); - return PTR_ERR(tag); - } /* 'arg:<tag>' decl_tag takes precedence over derivation of * register type from BTF type itself */ - if (tag) { + while ((id = btf_find_next_decl_tag(btf, fn_t, i, "arg:", id)) > 0) { + const struct btf_type *tag_t = btf_type_by_id(btf, id); + const char *tag = __btf_name_by_offset(btf, tag_t->name_off) + 4; + /* disallow arg tags in static subprogs */ if (!is_global) { bpf_log(log, "arg#%d type tag is not supported in static functions\n", i); return -EOPNOTSUPP; } + if (strcmp(tag, "ctx") == 0) { - sub->args[i].arg_type = ARG_PTR_TO_CTX; - continue; + tags |= ARG_TAG_CTX; + } else if (strcmp(tag, "trusted") == 0) { + tags |= ARG_TAG_TRUSTED; + } else if (strcmp(tag, "nonnull") == 0) { + tags |= ARG_TAG_NONNULL; + } else if (strcmp(tag, "nullable") == 0) { + tags |= ARG_TAG_NULLABLE; + } else if (strcmp(tag, "arena") == 0) { + tags |= ARG_TAG_ARENA; + } else { + bpf_log(log, "arg#%d has unsupported set of tags\n", i); + return -EOPNOTSUPP; } - if (strcmp(tag, "nonnull") == 0) - is_nonnull = true; + } + if (id != -ENOENT) { + bpf_log(log, "arg#%d type tag fetching failure: %d\n", i, id); + return id; } + t = btf_type_by_id(btf, args[i].type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_int(t) || btf_is_any_enum(t)) { - sub->args[i].arg_type = ARG_ANYTHING; - continue; - } - if (btf_type_is_ptr(t) && btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { + if (!btf_type_is_ptr(t)) + goto skip_pointer; + + if ((tags & ARG_TAG_CTX) || btf_is_prog_ctx_type(log, btf, t, prog_type, i)) { + if (tags & ~ARG_TAG_CTX) { + bpf_log(log, "arg#%d has invalid combination of tags\n", i); + return -EINVAL; + } + if ((tags & ARG_TAG_CTX) && + btf_validate_prog_ctx_type(log, btf, t, i, prog_type, + prog->expected_attach_type)) + return -EINVAL; sub->args[i].arg_type = ARG_PTR_TO_CTX; continue; } - if (btf_type_is_ptr(t) && btf_is_dynptr_ptr(btf, t)) { + if (btf_is_dynptr_ptr(btf, t)) { + if (tags) { + bpf_log(log, "arg#%d has invalid combination of tags\n", i); + return -EINVAL; + } sub->args[i].arg_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY; continue; } - if (is_global && btf_type_is_ptr(t)) { + if (tags & ARG_TAG_TRUSTED) { + int kern_type_id; + + if (tags & ARG_TAG_NONNULL) { + bpf_log(log, "arg#%d has invalid combination of tags\n", i); + return -EINVAL; + } + + kern_type_id = btf_get_ptr_to_btf_id(log, i, btf, t); + if (kern_type_id < 0) + return kern_type_id; + + sub->args[i].arg_type = ARG_PTR_TO_BTF_ID | PTR_TRUSTED; + if (tags & ARG_TAG_NULLABLE) + sub->args[i].arg_type |= PTR_MAYBE_NULL; + sub->args[i].btf_id = kern_type_id; + continue; + } + if (tags & ARG_TAG_ARENA) { + if (tags & ~ARG_TAG_ARENA) { + bpf_log(log, "arg#%d arena cannot be combined with any other tags\n", i); + return -EINVAL; + } + sub->args[i].arg_type = ARG_PTR_TO_ARENA; + continue; + } + if (is_global) { /* generic user data pointer */ u32 mem_size; + if (tags & ARG_TAG_NULLABLE) { + bpf_log(log, "arg#%d has invalid combination of tags\n", i); + return -EINVAL; + } + t = btf_type_skip_modifiers(btf, t->type, NULL); ref_t = btf_resolve_size(btf, t, &mem_size); if (IS_ERR(ref_t)) { - bpf_log(log, - "arg#%d reference type('%s %s') size cannot be determined: %ld\n", - i, btf_type_str(t), btf_name_by_offset(btf, t->name_off), + bpf_log(log, "arg#%d reference type('%s %s') size cannot be determined: %ld\n", + i, btf_type_str(t), btf_name_by_offset(btf, t->name_off), PTR_ERR(ref_t)); return -EINVAL; } - sub->args[i].arg_type = is_nonnull ? ARG_PTR_TO_MEM : ARG_PTR_TO_MEM_OR_NULL; + sub->args[i].arg_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL; + if (tags & ARG_TAG_NONNULL) + sub->args[i].arg_type &= ~PTR_MAYBE_NULL; sub->args[i].mem_size = mem_size; continue; } - if (is_nonnull) { - bpf_log(log, "arg#%d marked as non-null, but is not a pointer type\n", i); + +skip_pointer: + if (tags) { + bpf_log(log, "arg#%d has pointer tag, but is not a pointer type\n", i); return -EINVAL; } + if (btf_type_is_int(t) || btf_is_any_enum(t)) { + sub->args[i].arg_type = ARG_ANYTHING; + continue; + } + if (!is_global) + return -EINVAL; bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n", i, btf_type_str(t), tname); return -EINVAL; } - for (i = 0; i < nargs; i++) { - const char *tag; - - if (sub->args[i].arg_type != ARG_PTR_TO_CTX) - continue; - - /* check if arg has "arg:ctx" tag */ - t = btf_type_by_id(btf, args[i].type); - tag = btf_find_decl_tag_value(btf, fn_t, i, "arg:"); - if (IS_ERR_OR_NULL(tag) || strcmp(tag, "ctx") != 0) - continue; - - if (btf_validate_prog_ctx_type(log, btf, t, i, prog_type, - prog->expected_attach_type)) - return -EINVAL; - } - sub->arg_cnt = nargs; sub->args_cached = true; @@ -7589,6 +7807,17 @@ static struct btf *btf_get_module_btf(const struct module *module) return btf; } +static int check_btf_kconfigs(const struct module *module, const char *feature) +{ + if (!module && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { + pr_err("missing vmlinux BTF, cannot register %s\n", feature); + return -ENOENT; + } + if (module && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) + pr_warn("missing module BTF, cannot register %s\n", feature); + return 0; +} + BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int, flags) { struct btf *btf = NULL; @@ -7949,15 +8178,8 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook, int ret, i; btf = btf_get_module_btf(kset->owner); - if (!btf) { - if (!kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { - pr_err("missing vmlinux BTF, cannot register kfuncs\n"); - return -ENOENT; - } - if (kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) - pr_warn("missing module BTF, cannot register kfuncs\n"); - return 0; - } + if (!btf) + return check_btf_kconfigs(kset->owner, "kfunc"); if (IS_ERR(btf)) return PTR_ERR(btf); @@ -7981,6 +8203,14 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, { enum btf_kfunc_hook hook; + /* All kfuncs need to be tagged as such in BTF. + * WARN() for initcall registrations that do not check errors. + */ + if (!(kset->set->flags & BTF_SET8_KFUNCS)) { + WARN_ON(!kset->owner); + return -EINVAL; + } + hook = bpf_prog_type_to_kfunc_hook(prog_type); return __register_btf_kfunc_id_set(hook, kset); } @@ -8057,17 +8287,8 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c int ret; btf = btf_get_module_btf(owner); - if (!btf) { - if (!owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { - pr_err("missing vmlinux BTF, cannot register dtor kfuncs\n"); - return -ENOENT; - } - if (owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) { - pr_err("missing module BTF, cannot register dtor kfuncs\n"); - return -ENOENT; - } - return 0; - } + if (!btf) + return check_btf_kconfigs(owner, "dtor kfuncs"); if (IS_ERR(btf)) return PTR_ERR(btf); @@ -8182,17 +8403,6 @@ size_t bpf_core_essential_name_len(const char *name) return n; } -struct bpf_cand_cache { - const char *name; - u32 name_len; - u16 kind; - u16 cnt; - struct { - const struct btf *btf; - u32 id; - } cands[]; -}; - static void bpf_free_cands(struct bpf_cand_cache *cands) { if (!cands->cnt) @@ -8213,8 +8423,6 @@ static struct bpf_cand_cache *vmlinux_cand_cache[VMLINUX_CAND_CACHE_SIZE]; #define MODULE_CAND_CACHE_SIZE 31 static struct bpf_cand_cache *module_cand_cache[MODULE_CAND_CACHE_SIZE]; -static DEFINE_MUTEX(cand_cache_mutex); - static void __print_cand_cache(struct bpf_verifier_log *log, struct bpf_cand_cache **cache, int cache_size) @@ -8645,3 +8853,141 @@ bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log, return !strncmp(reg_name, arg_name, cmp_len); } + +#ifdef CONFIG_BPF_JIT +static int +btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops, + struct bpf_verifier_log *log) +{ + struct btf_struct_ops_tab *tab, *new_tab; + int i, err; + + tab = btf->struct_ops_tab; + if (!tab) { + tab = kzalloc(offsetof(struct btf_struct_ops_tab, ops[4]), + GFP_KERNEL); + if (!tab) + return -ENOMEM; + tab->capacity = 4; + btf->struct_ops_tab = tab; + } + + for (i = 0; i < tab->cnt; i++) + if (tab->ops[i].st_ops == st_ops) + return -EEXIST; + + if (tab->cnt == tab->capacity) { + new_tab = krealloc(tab, + offsetof(struct btf_struct_ops_tab, + ops[tab->capacity * 2]), + GFP_KERNEL); + if (!new_tab) + return -ENOMEM; + tab = new_tab; + tab->capacity *= 2; + btf->struct_ops_tab = tab; + } + + tab->ops[btf->struct_ops_tab->cnt].st_ops = st_ops; + + err = bpf_struct_ops_desc_init(&tab->ops[btf->struct_ops_tab->cnt], btf, log); + if (err) + return err; + + btf->struct_ops_tab->cnt++; + + return 0; +} + +const struct bpf_struct_ops_desc * +bpf_struct_ops_find_value(struct btf *btf, u32 value_id) +{ + const struct bpf_struct_ops_desc *st_ops_list; + unsigned int i; + u32 cnt; + + if (!value_id) + return NULL; + if (!btf->struct_ops_tab) + return NULL; + + cnt = btf->struct_ops_tab->cnt; + st_ops_list = btf->struct_ops_tab->ops; + for (i = 0; i < cnt; i++) { + if (st_ops_list[i].value_id == value_id) + return &st_ops_list[i]; + } + + return NULL; +} + +const struct bpf_struct_ops_desc * +bpf_struct_ops_find(struct btf *btf, u32 type_id) +{ + const struct bpf_struct_ops_desc *st_ops_list; + unsigned int i; + u32 cnt; + + if (!type_id) + return NULL; + if (!btf->struct_ops_tab) + return NULL; + + cnt = btf->struct_ops_tab->cnt; + st_ops_list = btf->struct_ops_tab->ops; + for (i = 0; i < cnt; i++) { + if (st_ops_list[i].type_id == type_id) + return &st_ops_list[i]; + } + + return NULL; +} + +int __register_bpf_struct_ops(struct bpf_struct_ops *st_ops) +{ + struct bpf_verifier_log *log; + struct btf *btf; + int err = 0; + + btf = btf_get_module_btf(st_ops->owner); + if (!btf) + return check_btf_kconfigs(st_ops->owner, "struct_ops"); + if (IS_ERR(btf)) + return PTR_ERR(btf); + + log = kzalloc(sizeof(*log), GFP_KERNEL | __GFP_NOWARN); + if (!log) { + err = -ENOMEM; + goto errout; + } + + log->level = BPF_LOG_KERNEL; + + err = btf_add_struct_ops(btf, st_ops, log); + +errout: + kfree(log); + btf_put(btf); + + return err; +} +EXPORT_SYMBOL_GPL(__register_bpf_struct_ops); +#endif + +bool btf_param_match_suffix(const struct btf *btf, + const struct btf_param *arg, + const char *suffix) +{ + int suffix_len = strlen(suffix), len; + const char *param_name; + + /* In the future, this can be ported to use BTF tagging */ + param_name = btf_name_by_offset(btf, arg->name_off); + if (str_is_empty(param_name)) + return false; + len = strlen(param_name); + if (len <= suffix_len) + return false; + param_name += len - suffix_len; + return !strncmp(param_name, suffix, suffix_len); +} diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 491d20038cbe..82243cb6c54d 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1358,15 +1358,12 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, enum cgroup_bpf_attach_type atype) { - unsigned int offset = skb->data - skb_network_header(skb); + unsigned int offset = -skb_network_offset(skb); struct sock *save_sk; void *saved_data_end; struct cgroup *cgrp; int ret; - if (!sk || !sk_fullsock(sk)) - return 0; - if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) return 0; @@ -1630,7 +1627,7 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } } @@ -2191,7 +2188,7 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } } @@ -2348,7 +2345,7 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } } diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ea6843be2616..696bc55de8e8 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -88,13 +88,18 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns return NULL; } +/* tell bpf programs that include vmlinux.h kernel's PAGE_SIZE */ +enum page_size_enum { + __PAGE_SIZE = PAGE_SIZE +}; + struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags); struct bpf_prog_aux *aux; struct bpf_prog *fp; - size = round_up(size, PAGE_SIZE); + size = round_up(size, __PAGE_SIZE); fp = __vmalloc(size, gfp_flags); if (fp == NULL) return NULL; @@ -682,7 +687,7 @@ static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) void bpf_prog_kallsyms_add(struct bpf_prog *fp) { if (!bpf_prog_kallsyms_candidate(fp) || - !bpf_capable()) + !bpf_token_capable(fp->aux->token, CAP_BPF)) return; bpf_prog_ksym_set_addr(fp); @@ -888,7 +893,12 @@ static LIST_HEAD(pack_list); * CONFIG_MMU=n. Use PAGE_SIZE in these cases. */ #ifdef PMD_SIZE -#define BPF_PROG_PACK_SIZE (PMD_SIZE * num_possible_nodes()) +/* PMD_SIZE is really big for some archs. It doesn't make sense to + * reserve too much memory in one allocation. Hardcode BPF_PROG_PACK_SIZE to + * 2MiB * num_possible_nodes(). On most architectures PMD_SIZE will be + * greater than or equal to 2MB. + */ +#define BPF_PROG_PACK_SIZE (SZ_2M * num_possible_nodes()) #else #define BPF_PROG_PACK_SIZE PAGE_SIZE #endif @@ -1675,6 +1685,7 @@ bool bpf_opcode_in_insntable(u8 code) [BPF_LD | BPF_IND | BPF_B] = true, [BPF_LD | BPF_IND | BPF_H] = true, [BPF_LD | BPF_IND | BPF_W] = true, + [BPF_JMP | BPF_JCOND] = true, }; #undef BPF_INSN_3_TBL #undef BPF_INSN_2_TBL @@ -2695,7 +2706,7 @@ void __bpf_free_used_maps(struct bpf_prog_aux *aux, bool sleepable; u32 i; - sleepable = aux->sleepable; + sleepable = aux->prog->sleepable; for (i = 0; i < len; i++) { map = used_maps[i]; if (map->ops->map_poke_untrack) @@ -2779,6 +2790,7 @@ void bpf_prog_free(struct bpf_prog *fp) if (aux->dst_prog) bpf_prog_put(aux->dst_prog); + bpf_token_put(aux->token); INIT_WORK(&aux->work, bpf_prog_free_deferred); schedule_work(&aux->work); } @@ -2925,6 +2937,21 @@ bool __weak bpf_jit_supports_far_kfunc_call(void) return false; } +bool __weak bpf_jit_supports_arena(void) +{ + return false; +} + +/* Return TRUE if the JIT backend satisfies the following two conditions: + * 1) JIT backend supports atomic_xchg() on pointer-sized words. + * 2) Under the specific arch, the implementation of xchg() is the same + * as atomic_xchg() on pointer-sized words. + */ +bool __weak bpf_jit_supports_ptr_xchg(void) +{ + return false; +} + /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. */ @@ -2959,6 +2986,17 @@ void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, { } +/* for configs without MMU or 32-bit */ +__weak const struct bpf_map_ops arena_map_ops; +__weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena) +{ + return 0; +} +__weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena) +{ + return 0; +} + #ifdef CONFIG_BPF_SYSCALL static int __init bpf_global_ma_init(void) { diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index ef82ffc90cbe..a8e34416e960 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -24,6 +24,7 @@ #include <linux/filter.h> #include <linux/ptr_ring.h> #include <net/xdp.h> +#include <net/hotdata.h> #include <linux/sched.h> #include <linux/workqueue.h> @@ -262,6 +263,7 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, static int cpu_map_kthread_run(void *data) { struct bpf_cpu_map_entry *rcpu = data; + unsigned long last_qs = jiffies; complete(&rcpu->kthread_running); set_current_state(TASK_INTERRUPTIBLE); @@ -287,10 +289,12 @@ static int cpu_map_kthread_run(void *data) if (__ptr_ring_empty(rcpu->queue)) { schedule(); sched = 1; + last_qs = jiffies; } else { __set_current_state(TASK_RUNNING); } } else { + rcu_softirq_qs_periodic(last_qs); sched = cond_resched(); } @@ -326,7 +330,8 @@ static int cpu_map_kthread_run(void *data) /* Support running another XDP prog on this CPU */ nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list); if (nframes) { - m = kmem_cache_alloc_bulk(skbuff_cache, gfp, nframes, skbs); + m = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, + gfp, nframes, skbs); if (unlikely(m == 0)) { for (i = 0; i < nframes; i++) skbs[i] = NULL; /* effect: xdp_return_frame */ diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index 2e73533a3811..dad0fb1c8e87 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -424,7 +424,7 @@ __bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask) __bpf_kfunc_end_defs(); -BTF_SET8_START(cpumask_kfunc_btf_ids) +BTF_KFUNCS_START(cpumask_kfunc_btf_ids) BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) @@ -450,7 +450,7 @@ BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU) -BTF_SET8_END(cpumask_kfunc_btf_ids) +BTF_KFUNCS_END(cpumask_kfunc_btf_ids) static const struct btf_kfunc_id_set cpumask_kfunc_set = { .owner = THIS_MODULE, diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a936c704d4e7..4e2cdbb5629f 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -130,13 +130,14 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) bpf_map_init_from_attr(&dtab->map, attr); if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { - dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); - - if (!dtab->n_buckets) /* Overflow check */ + /* hash table size must be power of 2; roundup_pow_of_two() can + * overflow into UB on 32-bit arches, so check that first + */ + if (dtab->map.max_entries > 1UL << 31) return -EINVAL; - } - if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); + dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets, dtab->map.numa_node); if (!dtab->dev_index_head) diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 49940c26a227..bd2e2dd04740 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -166,6 +166,12 @@ static bool is_movsx(const struct bpf_insn *insn) (insn->off == 8 || insn->off == 16 || insn->off == 32); } +static bool is_addr_space_cast(const struct bpf_insn *insn) +{ + return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && + insn->off == BPF_ADDR_SPACE_CAST; +} + void print_bpf_insn(const struct bpf_insn_cbs *cbs, const struct bpf_insn *insn, bool allow_ptr_leaks) @@ -184,6 +190,10 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, class == BPF_ALU ? 'w' : 'r', insn->dst_reg); + } else if (is_addr_space_cast(insn)) { + verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %d, %d)\n", + insn->code, insn->dst_reg, + insn->src_reg, ((u32)insn->imm) >> 16, (u16)insn->imm); } else if (BPF_SRC(insn->code) == BPF_X) { verbose(cbs->private_data, "(%02x) %c%d %s %s%c%d\n", insn->code, class == BPF_ALU ? 'w' : 'r', @@ -322,6 +332,10 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, } else if (insn->code == (BPF_JMP | BPF_JA)) { verbose(cbs->private_data, "(%02x) goto pc%+d\n", insn->code, insn->off); + } else if (insn->code == (BPF_JMP | BPF_JCOND) && + insn->src_reg == BPF_MAY_GOTO) { + verbose(cbs->private_data, "(%02x) may_goto pc%+d\n", + insn->code, insn->off); } else if (insn->code == (BPF_JMP32 | BPF_JA)) { verbose(cbs->private_data, "(%02x) gotol pc%+d\n", insn->code, insn->imm); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 03a6a2500b6a..3a088a5349bc 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -499,7 +499,13 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) num_possible_cpus()); } - /* hash table size must be power of 2 */ + /* hash table size must be power of 2; roundup_pow_of_two() can overflow + * into UB on 32-bit arches, so check that first + */ + err = -E2BIG; + if (htab->map.max_entries > 1UL << 31) + goto free_htab; + htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); htab->elem_size = sizeof(struct htab_elem) + @@ -509,10 +515,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) else htab->elem_size += round_up(htab->map.value_size, 8); - err = -E2BIG; - /* prevent zero size kmalloc and check for u32 overflow */ - if (htab->n_buckets == 0 || - htab->n_buckets > U32_MAX / sizeof(struct bucket)) + /* check for u32 overflow */ + if (htab->n_buckets > U32_MAX / sizeof(struct bucket)) goto free_htab; err = bpf_map_init_elem_count(&htab->map); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index d19cd863d294..a89587859571 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -334,7 +334,7 @@ static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock) __this_cpu_write(irqsave_flags, flags); } -notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock) +NOTRACE_BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock) { __bpf_spin_lock_irqsave(lock); return 0; @@ -357,7 +357,7 @@ static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock) local_irq_restore(flags); } -notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock) +NOTRACE_BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock) { __bpf_spin_unlock_irqrestore(lock); return 0; @@ -1417,6 +1417,7 @@ BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr) { unsigned long *kptr = map_value; + /* This helper may be inlined by verifier. */ return xchg(kptr, (unsigned long)ptr); } @@ -1682,7 +1683,7 @@ const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; const struct bpf_func_proto bpf_task_pt_regs_proto __weak; const struct bpf_func_proto * -bpf_base_func_proto(enum bpf_func_id func_id) +bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -1733,7 +1734,7 @@ bpf_base_func_proto(enum bpf_func_id func_id) break; } - if (!bpf_capable()) + if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return NULL; switch (func_id) { @@ -1791,7 +1792,7 @@ bpf_base_func_proto(enum bpf_func_id func_id) break; } - if (!perfmon_capable()) + if (!bpf_token_capable(prog->aux->token, CAP_PERFMON)) return NULL; switch (func_id) { @@ -2486,9 +2487,9 @@ __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj) return obj; } -__bpf_kfunc void *bpf_rdonly_cast(void *obj__ign, u32 btf_id__k) +__bpf_kfunc void *bpf_rdonly_cast(const void *obj__ign, u32 btf_id__k) { - return obj__ign; + return (void *)obj__ign; } __bpf_kfunc void bpf_rcu_read_lock(void) @@ -2546,7 +2547,7 @@ __bpf_kfunc void bpf_throw(u64 cookie) __bpf_kfunc_end_defs(); -BTF_SET8_START(generic_btf_ids) +BTF_KFUNCS_START(generic_btf_ids) #ifdef CONFIG_KEXEC_CORE BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) #endif @@ -2575,7 +2576,7 @@ BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL) #endif BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_throw) -BTF_SET8_END(generic_btf_ids) +BTF_KFUNCS_END(generic_btf_ids) static const struct btf_kfunc_id_set generic_kfunc_set = { .owner = THIS_MODULE, @@ -2591,7 +2592,7 @@ BTF_ID(struct, cgroup) BTF_ID(func, bpf_cgroup_release_dtor) #endif -BTF_SET8_START(common_btf_ids) +BTF_KFUNCS_START(common_btf_ids) BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx) BTF_ID_FLAGS(func, bpf_rdonly_cast) BTF_ID_FLAGS(func, bpf_rcu_read_lock) @@ -2620,7 +2621,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_null) BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) -BTF_SET8_END(common_btf_ids) +BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { .owner = THIS_MODULE, diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 41e0a55c35f5..af5d2ffadd70 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -20,6 +20,7 @@ #include <linux/filter.h> #include <linux/bpf.h> #include <linux/bpf_trace.h> +#include <linux/kstrtox.h> #include "preload/bpf_preload.h" enum bpf_type { @@ -98,9 +99,9 @@ static const struct inode_operations bpf_prog_iops = { }; static const struct inode_operations bpf_map_iops = { }; static const struct inode_operations bpf_link_iops = { }; -static struct inode *bpf_get_inode(struct super_block *sb, - const struct inode *dir, - umode_t mode) +struct inode *bpf_get_inode(struct super_block *sb, + const struct inode *dir, + umode_t mode) { struct inode *inode; @@ -594,6 +595,136 @@ struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type typ } EXPORT_SYMBOL(bpf_prog_get_type_path); +struct bpffs_btf_enums { + const struct btf *btf; + const struct btf_type *cmd_t; + const struct btf_type *map_t; + const struct btf_type *prog_t; + const struct btf_type *attach_t; +}; + +static int find_bpffs_btf_enums(struct bpffs_btf_enums *info) +{ + const struct btf *btf; + const struct btf_type *t; + const char *name; + int i, n; + + memset(info, 0, sizeof(*info)); + + btf = bpf_get_btf_vmlinux(); + if (IS_ERR(btf)) + return PTR_ERR(btf); + if (!btf) + return -ENOENT; + + info->btf = btf; + + for (i = 1, n = btf_nr_types(btf); i < n; i++) { + t = btf_type_by_id(btf, i); + if (!btf_type_is_enum(t)) + continue; + + name = btf_name_by_offset(btf, t->name_off); + if (!name) + continue; + + if (strcmp(name, "bpf_cmd") == 0) + info->cmd_t = t; + else if (strcmp(name, "bpf_map_type") == 0) + info->map_t = t; + else if (strcmp(name, "bpf_prog_type") == 0) + info->prog_t = t; + else if (strcmp(name, "bpf_attach_type") == 0) + info->attach_t = t; + else + continue; + + if (info->cmd_t && info->map_t && info->prog_t && info->attach_t) + return 0; + } + + return -ESRCH; +} + +static bool find_btf_enum_const(const struct btf *btf, const struct btf_type *enum_t, + const char *prefix, const char *str, int *value) +{ + const struct btf_enum *e; + const char *name; + int i, n, pfx_len = strlen(prefix); + + *value = 0; + + if (!btf || !enum_t) + return false; + + for (i = 0, n = btf_vlen(enum_t); i < n; i++) { + e = &btf_enum(enum_t)[i]; + + name = btf_name_by_offset(btf, e->name_off); + if (!name || strncasecmp(name, prefix, pfx_len) != 0) + continue; + + /* match symbolic name case insensitive and ignoring prefix */ + if (strcasecmp(name + pfx_len, str) == 0) { + *value = e->val; + return true; + } + } + + return false; +} + +static void seq_print_delegate_opts(struct seq_file *m, + const char *opt_name, + const struct btf *btf, + const struct btf_type *enum_t, + const char *prefix, + u64 delegate_msk, u64 any_msk) +{ + const struct btf_enum *e; + bool first = true; + const char *name; + u64 msk; + int i, n, pfx_len = strlen(prefix); + + delegate_msk &= any_msk; /* clear unknown bits */ + + if (delegate_msk == 0) + return; + + seq_printf(m, ",%s", opt_name); + if (delegate_msk == any_msk) { + seq_printf(m, "=any"); + return; + } + + if (btf && enum_t) { + for (i = 0, n = btf_vlen(enum_t); i < n; i++) { + e = &btf_enum(enum_t)[i]; + name = btf_name_by_offset(btf, e->name_off); + if (!name || strncasecmp(name, prefix, pfx_len) != 0) + continue; + msk = 1ULL << e->val; + if (delegate_msk & msk) { + /* emit lower-case name without prefix */ + seq_printf(m, "%c", first ? '=' : ':'); + name += pfx_len; + while (*name) { + seq_printf(m, "%c", tolower(*name)); + name++; + } + + delegate_msk &= ~msk; + first = false; + } + } + } + if (delegate_msk) + seq_printf(m, "%c0x%llx", first ? '=' : ':', delegate_msk); +} + /* * Display the mount options in /proc/mounts. */ @@ -601,6 +732,8 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root) { struct inode *inode = d_inode(root); umode_t mode = inode->i_mode & S_IALLUGO & ~S_ISVTX; + struct bpf_mount_opts *opts = root->d_sb->s_fs_info; + u64 mask; if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID)) seq_printf(m, ",uid=%u", @@ -610,6 +743,35 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root) from_kgid_munged(&init_user_ns, inode->i_gid)); if (mode != S_IRWXUGO) seq_printf(m, ",mode=%o", mode); + + if (opts->delegate_cmds || opts->delegate_maps || + opts->delegate_progs || opts->delegate_attachs) { + struct bpffs_btf_enums info; + + /* ignore errors, fallback to hex */ + (void)find_bpffs_btf_enums(&info); + + mask = (1ULL << __MAX_BPF_CMD) - 1; + seq_print_delegate_opts(m, "delegate_cmds", + info.btf, info.cmd_t, "BPF_", + opts->delegate_cmds, mask); + + mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1; + seq_print_delegate_opts(m, "delegate_maps", + info.btf, info.map_t, "BPF_MAP_TYPE_", + opts->delegate_maps, mask); + + mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1; + seq_print_delegate_opts(m, "delegate_progs", + info.btf, info.prog_t, "BPF_PROG_TYPE_", + opts->delegate_progs, mask); + + mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1; + seq_print_delegate_opts(m, "delegate_attachs", + info.btf, info.attach_t, "BPF_", + opts->delegate_attachs, mask); + } + return 0; } @@ -624,7 +786,7 @@ static void bpf_free_inode(struct inode *inode) free_inode_nonrcu(inode); } -static const struct super_operations bpf_super_ops = { +const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .show_options = bpf_show_options, @@ -635,28 +797,30 @@ enum { OPT_UID, OPT_GID, OPT_MODE, + OPT_DELEGATE_CMDS, + OPT_DELEGATE_MAPS, + OPT_DELEGATE_PROGS, + OPT_DELEGATE_ATTACHS, }; static const struct fs_parameter_spec bpf_fs_parameters[] = { fsparam_u32 ("uid", OPT_UID), fsparam_u32 ("gid", OPT_GID), fsparam_u32oct ("mode", OPT_MODE), + fsparam_string ("delegate_cmds", OPT_DELEGATE_CMDS), + fsparam_string ("delegate_maps", OPT_DELEGATE_MAPS), + fsparam_string ("delegate_progs", OPT_DELEGATE_PROGS), + fsparam_string ("delegate_attachs", OPT_DELEGATE_ATTACHS), {} }; -struct bpf_mount_opts { - kuid_t uid; - kgid_t gid; - umode_t mode; -}; - static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) { - struct bpf_mount_opts *opts = fc->fs_private; + struct bpf_mount_opts *opts = fc->s_fs_info; struct fs_parse_result result; kuid_t uid; kgid_t gid; - int opt; + int opt, err; opt = fs_parse(fc, bpf_fs_parameters, param, &result); if (opt < 0) { @@ -708,6 +872,67 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) case OPT_MODE: opts->mode = result.uint_32 & S_IALLUGO; break; + case OPT_DELEGATE_CMDS: + case OPT_DELEGATE_MAPS: + case OPT_DELEGATE_PROGS: + case OPT_DELEGATE_ATTACHS: { + struct bpffs_btf_enums info; + const struct btf_type *enum_t; + const char *enum_pfx; + u64 *delegate_msk, msk = 0; + char *p; + int val; + + /* ignore errors, fallback to hex */ + (void)find_bpffs_btf_enums(&info); + + switch (opt) { + case OPT_DELEGATE_CMDS: + delegate_msk = &opts->delegate_cmds; + enum_t = info.cmd_t; + enum_pfx = "BPF_"; + break; + case OPT_DELEGATE_MAPS: + delegate_msk = &opts->delegate_maps; + enum_t = info.map_t; + enum_pfx = "BPF_MAP_TYPE_"; + break; + case OPT_DELEGATE_PROGS: + delegate_msk = &opts->delegate_progs; + enum_t = info.prog_t; + enum_pfx = "BPF_PROG_TYPE_"; + break; + case OPT_DELEGATE_ATTACHS: + delegate_msk = &opts->delegate_attachs; + enum_t = info.attach_t; + enum_pfx = "BPF_"; + break; + default: + return -EINVAL; + } + + while ((p = strsep(¶m->string, ":"))) { + if (strcmp(p, "any") == 0) { + msk |= ~0ULL; + } else if (find_btf_enum_const(info.btf, enum_t, enum_pfx, p, &val)) { + msk |= 1ULL << val; + } else { + err = kstrtou64(p, 0, &msk); + if (err) + return err; + } + } + + /* Setting delegation mount options requires privileges */ + if (msk && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + *delegate_msk |= msk; + break; + } + default: + /* ignore unknown mount options */ + break; } return 0; @@ -784,10 +1009,14 @@ out: static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) { static const struct tree_descr bpf_rfiles[] = { { "" } }; - struct bpf_mount_opts *opts = fc->fs_private; + struct bpf_mount_opts *opts = sb->s_fs_info; struct inode *inode; int ret; + /* Mounting an instance of BPF FS requires privileges */ + if (fc->user_ns != &init_user_ns && !capable(CAP_SYS_ADMIN)) + return -EPERM; + ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles); if (ret) return ret; @@ -811,7 +1040,7 @@ static int bpf_get_tree(struct fs_context *fc) static void bpf_free_fc(struct fs_context *fc) { - kfree(fc->fs_private); + kfree(fc->s_fs_info); } static const struct fs_context_operations bpf_context_ops = { @@ -835,17 +1064,32 @@ static int bpf_init_fs_context(struct fs_context *fc) opts->uid = current_fsuid(); opts->gid = current_fsgid(); - fc->fs_private = opts; + /* start out with no BPF token delegation enabled */ + opts->delegate_cmds = 0; + opts->delegate_maps = 0; + opts->delegate_progs = 0; + opts->delegate_attachs = 0; + + fc->s_fs_info = opts; fc->ops = &bpf_context_ops; return 0; } +static void bpf_kill_super(struct super_block *sb) +{ + struct bpf_mount_opts *opts = sb->s_fs_info; + + kill_litter_super(sb); + kfree(opts); +} + static struct file_system_type bpf_fs_type = { .owner = THIS_MODULE, .name = "bpf", .init_fs_context = bpf_init_fs_context, .parameters = bpf_fs_parameters, - .kill_sb = kill_litter_super, + .kill_sb = bpf_kill_super, + .fs_flags = FS_USERNS_MOUNT, }; static int __init bpf_init(void) diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 594a234f122b..2a243cf37c60 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -9,6 +9,7 @@ #include <linux/bpf.h> #include <linux/bpf_verifier.h> #include <linux/math64.h> +#include <linux/string.h> #define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args) @@ -333,7 +334,8 @@ find_linfo(const struct bpf_verifier_env *env, u32 insn_off) { const struct bpf_line_info *linfo; const struct bpf_prog *prog; - u32 i, nr_linfo; + u32 nr_linfo; + int l, r, m; prog = env->prog; nr_linfo = prog->aux->nr_linfo; @@ -342,11 +344,30 @@ find_linfo(const struct bpf_verifier_env *env, u32 insn_off) return NULL; linfo = prog->aux->linfo; - for (i = 1; i < nr_linfo; i++) - if (insn_off < linfo[i].insn_off) - break; + /* Loop invariant: linfo[l].insn_off <= insns_off. + * linfo[0].insn_off == 0 which always satisfies above condition. + * Binary search is searching for rightmost linfo entry that satisfies + * the above invariant, giving us the desired record that covers given + * instruction offset. + */ + l = 0; + r = nr_linfo - 1; + while (l < r) { + /* (r - l + 1) / 2 means we break a tie to the right, so if: + * l=1, r=2, linfo[l].insn_off <= insn_off, linfo[r].insn_off > insn_off, + * then m=2, we see that linfo[m].insn_off > insn_off, and so + * r becomes 1 and we exit the loop with correct l==1. + * If the tie was broken to the left, m=1 would end us up in + * an endless loop where l and m stay at 1 and r stays at 2. + */ + m = l + (r - l + 1) / 2; + if (linfo[m].insn_off <= insn_off) + l = m; + else + r = m - 1; + } - return &linfo[i - 1]; + return &linfo[l]; } static const char *ltrim(const char *s) @@ -361,13 +382,28 @@ __printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env, u32 insn_off, const char *prefix_fmt, ...) { - const struct bpf_line_info *linfo; + const struct bpf_line_info *linfo, *prev_linfo; + const struct btf *btf; + const char *s, *fname; if (!bpf_verifier_log_needed(&env->log)) return; + prev_linfo = env->prev_linfo; linfo = find_linfo(env, insn_off); - if (!linfo || linfo == env->prev_linfo) + if (!linfo || linfo == prev_linfo) + return; + + /* It often happens that two separate linfo records point to the same + * source code line, but have differing column numbers. Given verifier + * log doesn't emit column information, from user perspective we just + * end up emitting the same source code line twice unnecessarily. + * So instead check that previous and current linfo record point to + * the same file (file_name_offs match) and the same line number, and + * avoid emitting duplicated source code line in such case. + */ + if (prev_linfo && linfo->file_name_off == prev_linfo->file_name_off && + BPF_LINE_INFO_LINE_NUM(linfo->line_col) == BPF_LINE_INFO_LINE_NUM(prev_linfo->line_col)) return; if (prefix_fmt) { @@ -378,9 +414,15 @@ __printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env, va_end(args); } - verbose(env, "%s\n", - ltrim(btf_name_by_offset(env->prog->aux->btf, - linfo->line_off))); + btf = env->prog->aux->btf; + s = ltrim(btf_name_by_offset(btf, linfo->line_off)); + verbose(env, "%s", s); /* source code line */ + + s = btf_name_by_offset(btf, linfo->file_name_off); + /* leave only file name */ + fname = strrchr(s, '/'); + fname = fname ? fname + 1 : s; + verbose(env, " @ %s:%u\n", fname, BPF_LINE_INFO_LINE_NUM(linfo->line_col)); env->prev_linfo = linfo; } @@ -416,6 +458,7 @@ const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", [PTR_TO_MEM] = "mem", + [PTR_TO_ARENA] = "arena", [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", [PTR_TO_MAP_KEY] = "map_key", @@ -651,6 +694,8 @@ static void print_reg_state(struct bpf_verifier_env *env, } verbose(env, "%s", reg_type_str(env, t)); + if (t == PTR_TO_ARENA) + return; if (t == PTR_TO_STACK) { if (state->frameno != reg->frameno) verbose(env, "[%d]", reg->frameno); diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index b32be680da6c..050fe1ebf0f7 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -164,13 +164,13 @@ static inline int extract_bit(const u8 *data, size_t index) */ static size_t longest_prefix_match(const struct lpm_trie *trie, const struct lpm_trie_node *node, - const struct bpf_lpm_trie_key *key) + const struct bpf_lpm_trie_key_u8 *key) { u32 limit = min(node->prefixlen, key->prefixlen); u32 prefixlen = 0, i = 0; BUILD_BUG_ON(offsetof(struct lpm_trie_node, data) % sizeof(u32)); - BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key, data) % sizeof(u32)); + BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key_u8, data) % sizeof(u32)); #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(CONFIG_64BIT) @@ -229,7 +229,7 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct lpm_trie_node *node, *found = NULL; - struct bpf_lpm_trie_key *key = _key; + struct bpf_lpm_trie_key_u8 *key = _key; if (key->prefixlen > trie->max_prefixlen) return NULL; @@ -309,7 +309,7 @@ static long trie_update_elem(struct bpf_map *map, struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL; struct lpm_trie_node __rcu **slot; - struct bpf_lpm_trie_key *key = _key; + struct bpf_lpm_trie_key_u8 *key = _key; unsigned long irq_flags; unsigned int next_bit; size_t matchlen = 0; @@ -437,7 +437,7 @@ out: static long trie_delete_elem(struct bpf_map *map, void *_key) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); - struct bpf_lpm_trie_key *key = _key; + struct bpf_lpm_trie_key_u8 *key = _key; struct lpm_trie_node __rcu **trim, **trim2; struct lpm_trie_node *node, *parent; unsigned long irq_flags; @@ -536,7 +536,7 @@ out: sizeof(struct lpm_trie_node)) #define LPM_VAL_SIZE_MIN 1 -#define LPM_KEY_SIZE(X) (sizeof(struct bpf_lpm_trie_key) + (X)) +#define LPM_KEY_SIZE(X) (sizeof(struct bpf_lpm_trie_key_u8) + (X)) #define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX) #define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) @@ -565,7 +565,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) /* copy mandatory map attributes */ bpf_map_init_from_attr(&trie->map, attr); trie->data_size = attr->key_size - - offsetof(struct bpf_lpm_trie_key, data); + offsetof(struct bpf_lpm_trie_key_u8, data); trie->max_prefixlen = trie->data_size * 8; spin_lock_init(&trie->lock); @@ -616,7 +616,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) { struct lpm_trie_node *node, *next_node = NULL, *parent, *search_root; struct lpm_trie *trie = container_of(map, struct lpm_trie, map); - struct bpf_lpm_trie_key *key = _key, *next_key = _next_key; + struct bpf_lpm_trie_key_u8 *key = _key, *next_key = _next_key; struct lpm_trie_node **node_stack = NULL; int err = 0, stack_ptr = -1; unsigned int next_bit; @@ -703,7 +703,7 @@ find_leftmost: } do_copy: next_key->prefixlen = next_node->prefixlen; - memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data), + memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key_u8, data), next_node->data, trie->data_size); free_stack: kfree(node_stack); @@ -715,7 +715,7 @@ static int trie_check_btf(const struct bpf_map *map, const struct btf_type *key_type, const struct btf_type *value_type) { - /* Keys must have struct bpf_lpm_trie_key embedded. */ + /* Keys must have struct bpf_lpm_trie_key_u8 embedded. */ return BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ? -EINVAL : 0; } diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 6abd7c5df4b3..9575314f40a6 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -213,9 +213,9 @@ __bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map) __bpf_kfunc_end_defs(); -BTF_SET8_START(bpf_map_iter_kfunc_ids) +BTF_KFUNCS_START(bpf_map_iter_kfunc_ids) BTF_ID_FLAGS(func, bpf_map_sum_elem_count, KF_TRUSTED_ARGS) -BTF_SET8_END(bpf_map_iter_kfunc_ids) +BTF_KFUNCS_END(bpf_map_iter_kfunc_ids) static const struct btf_kfunc_id_set bpf_map_iter_kfunc_set = { .owner = THIS_MODULE, diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index dff7ba539701..c99f8e5234ac 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -91,11 +91,14 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) } else if (value_size / 8 > sysctl_perf_event_max_stack) return ERR_PTR(-EINVAL); - /* hash table size must be power of 2 */ - n_buckets = roundup_pow_of_two(attr->max_entries); - if (!n_buckets) + /* hash table size must be power of 2; roundup_pow_of_two() can overflow + * into UB on 32-bit arches, so check that first + */ + if (attr->max_entries > 1UL << 31) return ERR_PTR(-E2BIG); + n_buckets = roundup_pow_of_two(attr->max_entries); + cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); if (!smap) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a1f18681721c..ae2ff73bde7e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -164,6 +164,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, if (bpf_map_is_offloaded(map)) { return bpf_map_offload_update_elem(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || + map->map_type == BPF_MAP_TYPE_ARENA || map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { return map->ops->map_update_elem(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH || @@ -479,6 +480,39 @@ static void bpf_map_release_memcg(struct bpf_map *map) } #endif +int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, + unsigned long nr_pages, struct page **pages) +{ + unsigned long i, j; + struct page *pg; + int ret = 0; +#ifdef CONFIG_MEMCG_KMEM + struct mem_cgroup *memcg, *old_memcg; + + memcg = bpf_map_get_memcg(map); + old_memcg = set_active_memcg(memcg); +#endif + for (i = 0; i < nr_pages; i++) { + pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0); + + if (pg) { + pages[i] = pg; + continue; + } + for (j = 0; j < i; j++) + __free_page(pages[j]); + ret = -ENOMEM; + break; + } + +#ifdef CONFIG_MEMCG_KMEM + set_active_memcg(old_memcg); + mem_cgroup_put(memcg); +#endif + return ret; +} + + static int btf_field_cmp(const void *a, const void *b) { const struct btf_field *f1 = a, *f2 = b; @@ -937,6 +971,21 @@ static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts) return EPOLLERR; } +static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct bpf_map *map = filp->private_data; + + if (map->ops->map_get_unmapped_area) + return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags); +#ifdef CONFIG_MMU + return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); +#else + return addr; +#endif +} + const struct file_operations bpf_map_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_map_show_fdinfo, @@ -946,6 +995,7 @@ const struct file_operations bpf_map_fops = { .write = bpf_dummy_write, .mmap = bpf_map_mmap, .poll = bpf_map_poll, + .get_unmapped_area = bpf_get_unmapped_area, }; int bpf_map_new_fd(struct bpf_map *map, int flags) @@ -1011,8 +1061,8 @@ int map_check_no_btf(const struct bpf_map *map, return -ENOTSUPP; } -static int map_check_btf(struct bpf_map *map, const struct btf *btf, - u32 btf_key_id, u32 btf_value_id) +static int map_check_btf(struct bpf_map *map, struct bpf_token *token, + const struct btf *btf, u32 btf_key_id, u32 btf_value_id) { const struct btf_type *key_type, *value_type; u32 key_size, value_size; @@ -1040,7 +1090,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, if (!IS_ERR_OR_NULL(map->record)) { int i; - if (!bpf_capable()) { + if (!bpf_token_capable(token, CAP_BPF)) { ret = -EPERM; goto free_map_tab; } @@ -1123,14 +1173,21 @@ free_map_tab: return ret; } -#define BPF_MAP_CREATE_LAST_FIELD map_extra +static bool bpf_net_capable(void) +{ + return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN); +} + +#define BPF_MAP_CREATE_LAST_FIELD map_token_fd /* called via syscall */ static int map_create(union bpf_attr *attr) { const struct bpf_map_ops *ops; + struct bpf_token *token = NULL; int numa_node = bpf_map_attr_numa_node(attr); u32 map_type = attr->map_type; struct bpf_map *map; + bool token_flag; int f_flags; int err; @@ -1138,6 +1195,12 @@ static int map_create(union bpf_attr *attr) if (err) return -EINVAL; + /* check BPF_F_TOKEN_FD flag, remember if it's set, and then clear it + * to avoid per-map type checks tripping on unknown flag + */ + token_flag = attr->map_flags & BPF_F_TOKEN_FD; + attr->map_flags &= ~BPF_F_TOKEN_FD; + if (attr->btf_vmlinux_value_type_id) { if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS || attr->btf_key_type_id || attr->btf_value_type_id) @@ -1147,6 +1210,7 @@ static int map_create(union bpf_attr *attr) } if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER && + attr->map_type != BPF_MAP_TYPE_ARENA && attr->map_extra != 0) return -EINVAL; @@ -1178,14 +1242,32 @@ static int map_create(union bpf_attr *attr) if (!ops->map_mem_usage) return -EINVAL; + if (token_flag) { + token = bpf_token_get_from_fd(attr->map_token_fd); + if (IS_ERR(token)) + return PTR_ERR(token); + + /* if current token doesn't grant map creation permissions, + * then we can't use this token, so ignore it and rely on + * system-wide capabilities checks + */ + if (!bpf_token_allow_cmd(token, BPF_MAP_CREATE) || + !bpf_token_allow_map_type(token, attr->map_type)) { + bpf_token_put(token); + token = NULL; + } + } + + err = -EPERM; + /* Intent here is for unprivileged_bpf_disabled to block BPF map * creation for unprivileged users; other actions depend * on fd availability and access to bpffs, so are dependent on * object creation success. Even with unprivileged BPF disabled, * capability checks are still carried out. */ - if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) - return -EPERM; + if (sysctl_unprivileged_bpf_disabled && !bpf_token_capable(token, CAP_BPF)) + goto put_token; /* check privileged map type permissions */ switch (map_type) { @@ -1218,25 +1300,28 @@ static int map_create(union bpf_attr *attr) case BPF_MAP_TYPE_LRU_PERCPU_HASH: case BPF_MAP_TYPE_STRUCT_OPS: case BPF_MAP_TYPE_CPUMAP: - if (!bpf_capable()) - return -EPERM; + case BPF_MAP_TYPE_ARENA: + if (!bpf_token_capable(token, CAP_BPF)) + goto put_token; break; case BPF_MAP_TYPE_SOCKMAP: case BPF_MAP_TYPE_SOCKHASH: case BPF_MAP_TYPE_DEVMAP: case BPF_MAP_TYPE_DEVMAP_HASH: case BPF_MAP_TYPE_XSKMAP: - if (!capable(CAP_NET_ADMIN)) - return -EPERM; + if (!bpf_token_capable(token, CAP_NET_ADMIN)) + goto put_token; break; default: WARN(1, "unsupported map type %d", map_type); - return -EPERM; + goto put_token; } map = ops->map_alloc(attr); - if (IS_ERR(map)) - return PTR_ERR(map); + if (IS_ERR(map)) { + err = PTR_ERR(map); + goto put_token; + } map->ops = ops; map->map_type = map_type; @@ -1273,7 +1358,7 @@ static int map_create(union bpf_attr *attr) map->btf = btf; if (attr->btf_value_type_id) { - err = map_check_btf(map, btf, attr->btf_key_type_id, + err = map_check_btf(map, token, btf, attr->btf_key_type_id, attr->btf_value_type_id); if (err) goto free_map; @@ -1285,15 +1370,16 @@ static int map_create(union bpf_attr *attr) attr->btf_vmlinux_value_type_id; } - err = security_bpf_map_alloc(map); + err = security_bpf_map_create(map, attr, token); if (err) - goto free_map; + goto free_map_sec; err = bpf_map_alloc_id(map); if (err) goto free_map_sec; bpf_map_save_memcg(map); + bpf_token_put(token); err = bpf_map_new_fd(map, f_flags); if (err < 0) { @@ -1314,6 +1400,8 @@ free_map_sec: free_map: btf_put(map->btf); map->ops->map_free(map); +put_token: + bpf_token_put(token); return err; } @@ -2144,7 +2232,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) kvfree(aux->func_info); kfree(aux->func_info_aux); free_uid(aux->user); - security_bpf_prog_free(aux); + security_bpf_prog_free(aux->prog); bpf_prog_free(aux->prog); } @@ -2160,7 +2248,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) btf_put(prog->aux->attach_btf); if (deferred) { - if (prog->aux->sleepable) + if (prog->sleepable) call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu); else call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); @@ -2590,13 +2678,15 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD log_true_size +#define BPF_PROG_LOAD_LAST_FIELD prog_token_fd static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) { enum bpf_prog_type type = attr->prog_type; struct bpf_prog *prog, *dst_prog = NULL; struct btf *attach_btf = NULL; + struct bpf_token *token = NULL; + bool bpf_cap; int err; char license[128]; @@ -2610,13 +2700,35 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) BPF_F_TEST_RND_HI32 | BPF_F_XDP_HAS_FRAGS | BPF_F_XDP_DEV_BOUND_ONLY | - BPF_F_TEST_REG_INVARIANTS)) + BPF_F_TEST_REG_INVARIANTS | + BPF_F_TOKEN_FD)) return -EINVAL; + bpf_prog_load_fixup_attach_type(attr); + + if (attr->prog_flags & BPF_F_TOKEN_FD) { + token = bpf_token_get_from_fd(attr->prog_token_fd); + if (IS_ERR(token)) + return PTR_ERR(token); + /* if current token doesn't grant prog loading permissions, + * then we can't use this token, so ignore it and rely on + * system-wide capabilities checks + */ + if (!bpf_token_allow_cmd(token, BPF_PROG_LOAD) || + !bpf_token_allow_prog_type(token, attr->prog_type, + attr->expected_attach_type)) { + bpf_token_put(token); + token = NULL; + } + } + + bpf_cap = bpf_token_capable(token, CAP_BPF); + err = -EPERM; + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && - !bpf_capable()) - return -EPERM; + !bpf_cap) + goto put_token; /* Intent here is for unprivileged_bpf_disabled to block BPF program * creation for unprivileged users; other actions depend @@ -2625,21 +2737,23 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) * capability checks are still carried out for these * and other operations. */ - if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) - return -EPERM; + if (sysctl_unprivileged_bpf_disabled && !bpf_cap) + goto put_token; if (attr->insn_cnt == 0 || - attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) - return -E2BIG; + attr->insn_cnt > (bpf_cap ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) { + err = -E2BIG; + goto put_token; + } if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && - !bpf_capable()) - return -EPERM; + !bpf_cap) + goto put_token; - if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN)) - return -EPERM; - if (is_perfmon_prog_type(type) && !perfmon_capable()) - return -EPERM; + if (is_net_admin_prog_type(type) && !bpf_token_capable(token, CAP_NET_ADMIN)) + goto put_token; + if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON)) + goto put_token; /* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog * or btf, we need to check which one it is @@ -2649,27 +2763,33 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (IS_ERR(dst_prog)) { dst_prog = NULL; attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd); - if (IS_ERR(attach_btf)) - return -EINVAL; + if (IS_ERR(attach_btf)) { + err = -EINVAL; + goto put_token; + } if (!btf_is_kernel(attach_btf)) { /* attaching through specifying bpf_prog's BTF * objects directly might be supported eventually */ btf_put(attach_btf); - return -ENOTSUPP; + err = -ENOTSUPP; + goto put_token; } } } else if (attr->attach_btf_id) { /* fall back to vmlinux BTF, if BTF type ID is specified */ attach_btf = bpf_get_btf_vmlinux(); - if (IS_ERR(attach_btf)) - return PTR_ERR(attach_btf); - if (!attach_btf) - return -EINVAL; + if (IS_ERR(attach_btf)) { + err = PTR_ERR(attach_btf); + goto put_token; + } + if (!attach_btf) { + err = -EINVAL; + goto put_token; + } btf_get(attach_btf); } - bpf_prog_load_fixup_attach_type(attr); if (bpf_prog_load_check_attach(type, attr->expected_attach_type, attach_btf, attr->attach_btf_id, dst_prog)) { @@ -2677,7 +2797,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) bpf_prog_put(dst_prog); if (attach_btf) btf_put(attach_btf); - return -EINVAL; + err = -EINVAL; + goto put_token; } /* plain bpf_prog allocation */ @@ -2687,20 +2808,21 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) bpf_prog_put(dst_prog); if (attach_btf) btf_put(attach_btf); - return -ENOMEM; + err = -EINVAL; + goto put_token; } prog->expected_attach_type = attr->expected_attach_type; + prog->sleepable = !!(attr->prog_flags & BPF_F_SLEEPABLE); prog->aux->attach_btf = attach_btf; prog->aux->attach_btf_id = attr->attach_btf_id; prog->aux->dst_prog = dst_prog; prog->aux->dev_bound = !!attr->prog_ifindex; - prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS; - err = security_bpf_prog_alloc(prog->aux); - if (err) - goto free_prog; + /* move token into prog->aux, reuse taken refcnt */ + prog->aux->token = token; + token = NULL; prog->aux->user = get_current_user(); prog->len = attr->insn_cnt; @@ -2709,12 +2831,12 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (copy_from_bpfptr(prog->insns, make_bpfptr(attr->insns, uattr.is_kernel), bpf_prog_insn_size(prog)) != 0) - goto free_prog_sec; + goto free_prog; /* copy eBPF program license from user space */ if (strncpy_from_bpfptr(license, make_bpfptr(attr->license, uattr.is_kernel), sizeof(license) - 1) < 0) - goto free_prog_sec; + goto free_prog; license[sizeof(license) - 1] = 0; /* eBPF programs must be GPL compatible to use GPL-ed functions */ @@ -2728,14 +2850,14 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (bpf_prog_is_dev_bound(prog->aux)) { err = bpf_prog_dev_bound_init(prog, attr); if (err) - goto free_prog_sec; + goto free_prog; } if (type == BPF_PROG_TYPE_EXT && dst_prog && bpf_prog_is_dev_bound(dst_prog->aux)) { err = bpf_prog_dev_bound_inherit(prog, dst_prog); if (err) - goto free_prog_sec; + goto free_prog; } /* @@ -2757,12 +2879,16 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); if (err < 0) - goto free_prog_sec; + goto free_prog; prog->aux->load_time = ktime_get_boottime_ns(); err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name, sizeof(attr->prog_name)); if (err < 0) + goto free_prog; + + err = security_bpf_prog_load(prog, attr, token); + if (err) goto free_prog_sec; /* run eBPF verifier */ @@ -2808,13 +2934,16 @@ free_used_maps: */ __bpf_prog_put_noref(prog, prog->aux->real_func_cnt); return err; + free_prog_sec: - free_uid(prog->aux->user); - security_bpf_prog_free(prog->aux); + security_bpf_prog_free(prog); free_prog: + free_uid(prog->aux->user); if (prog->aux->attach_btf) btf_put(prog->aux->attach_btf); bpf_prog_free(prog); +put_token: + bpf_token_put(token); return err; } @@ -3501,6 +3630,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event, if (!kallsyms_show_value(current_cred())) addr = 0; info->perf_event.kprobe.addr = addr; + info->perf_event.kprobe.cookie = event->bpf_cookie; return 0; } #endif @@ -3526,6 +3656,7 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event, else info->perf_event.type = BPF_PERF_EVENT_UPROBE; info->perf_event.uprobe.offset = offset; + info->perf_event.uprobe.cookie = event->bpf_cookie; return 0; } #endif @@ -3553,6 +3684,7 @@ static int bpf_perf_link_fill_tracepoint(const struct perf_event *event, uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name); ulen = info->perf_event.tracepoint.name_len; info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT; + info->perf_event.tracepoint.cookie = event->bpf_cookie; return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL, NULL); } @@ -3561,6 +3693,7 @@ static int bpf_perf_link_fill_perf_event(const struct perf_event *event, { info->perf_event.event.type = event->attr.type; info->perf_event.event.config = event->attr.config; + info->perf_event.event.cookie = event->bpf_cookie; info->perf_event.type = BPF_PERF_EVENT_EVENT; return 0; } @@ -3818,7 +3951,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, case BPF_PROG_TYPE_SK_LOOKUP: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: - if (!capable(CAP_NET_ADMIN)) + if (!bpf_token_capable(prog->aux->token, CAP_NET_ADMIN)) /* cg-skb progs can be loaded by unpriv user. * check permissions at attach time. */ @@ -4021,7 +4154,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) static int bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { - if (!capable(CAP_NET_ADMIN)) + if (!bpf_net_capable()) return -EPERM; if (CHECK_ATTR(BPF_PROG_QUERY)) return -EINVAL; @@ -4320,6 +4453,12 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, continue; } + if ((BPF_CLASS(code) == BPF_LDX || BPF_CLASS(code) == BPF_STX || + BPF_CLASS(code) == BPF_ST) && BPF_MODE(code) == BPF_PROBE_MEM32) { + insns[i].code = BPF_CLASS(code) | BPF_SIZE(code) | BPF_MEM; + continue; + } + if (code != (BPF_LD | BPF_IMM | BPF_DW)) continue; @@ -4687,6 +4826,8 @@ static int bpf_map_get_info_by_fd(struct file *file, info.btf_value_type_id = map->btf_value_type_id; } info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id; + if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) + bpf_map_struct_ops_info_fill(&info, map); if (bpf_map_is_offloaded(map)) { err = bpf_map_offload_info_fill(&info, map); @@ -4789,15 +4930,34 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, return err; } -#define BPF_BTF_LOAD_LAST_FIELD btf_log_true_size +#define BPF_BTF_LOAD_LAST_FIELD btf_token_fd static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) { + struct bpf_token *token = NULL; + if (CHECK_ATTR(BPF_BTF_LOAD)) return -EINVAL; - if (!bpf_capable()) + if (attr->btf_flags & ~BPF_F_TOKEN_FD) + return -EINVAL; + + if (attr->btf_flags & BPF_F_TOKEN_FD) { + token = bpf_token_get_from_fd(attr->btf_token_fd); + if (IS_ERR(token)) + return PTR_ERR(token); + if (!bpf_token_allow_cmd(token, BPF_BTF_LOAD)) { + bpf_token_put(token); + token = NULL; + } + } + + if (!bpf_token_capable(token, CAP_BPF)) { + bpf_token_put(token); return -EPERM; + } + + bpf_token_put(token); return btf_new_fd(attr, uattr, uattr_size); } @@ -5394,7 +5554,7 @@ static int bpf_prog_bind_map(union bpf_attr *attr) /* The bpf program will not access the bpf map, but for the sake of * simplicity, increase sleepable_refcnt for sleepable program as well. */ - if (prog->aux->sleepable) + if (prog->sleepable) atomic64_inc(&map->sleepable_refcnt); memcpy(used_maps_new, used_maps_old, sizeof(used_maps_old[0]) * prog->aux->used_map_cnt); @@ -5415,6 +5575,20 @@ out_prog_put: return ret; } +#define BPF_TOKEN_CREATE_LAST_FIELD token_create.bpffs_fd + +static int token_create(union bpf_attr *attr) +{ + if (CHECK_ATTR(BPF_TOKEN_CREATE)) + return -EINVAL; + + /* no flags are supported yet */ + if (attr->token_create.flags) + return -EINVAL; + + return bpf_token_create(attr); +} + static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; @@ -5548,6 +5722,9 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) case BPF_PROG_BIND_MAP: err = bpf_prog_bind_map(&attr); break; + case BPF_TOKEN_CREATE: + err = token_create(&attr); + break; default: err = -EINVAL; break; @@ -5654,7 +5831,7 @@ static const struct bpf_func_proto bpf_sys_bpf_proto = { const struct bpf_func_proto * __weak tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } BPF_CALL_1(bpf_sys_close, u32, fd) @@ -5704,7 +5881,8 @@ syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_sys_bpf: - return !perfmon_capable() ? NULL : &bpf_sys_bpf_proto; + return !bpf_token_capable(prog->aux->token, CAP_PERFMON) + ? NULL : &bpf_sys_bpf_proto; case BPF_FUNC_btf_find_by_name_kind: return &bpf_btf_find_by_name_kind_proto; case BPF_FUNC_sys_close: diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c new file mode 100644 index 000000000000..d6ccf8d00eab --- /dev/null +++ b/kernel/bpf/token.c @@ -0,0 +1,278 @@ +#include <linux/bpf.h> +#include <linux/vmalloc.h> +#include <linux/fdtable.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/kernel.h> +#include <linux/idr.h> +#include <linux/namei.h> +#include <linux/user_namespace.h> +#include <linux/security.h> + +static bool bpf_ns_capable(struct user_namespace *ns, int cap) +{ + return ns_capable(ns, cap) || (cap != CAP_SYS_ADMIN && ns_capable(ns, CAP_SYS_ADMIN)); +} + +bool bpf_token_capable(const struct bpf_token *token, int cap) +{ + struct user_namespace *userns; + + /* BPF token allows ns_capable() level of capabilities */ + userns = token ? token->userns : &init_user_ns; + if (!bpf_ns_capable(userns, cap)) + return false; + if (token && security_bpf_token_capable(token, cap) < 0) + return false; + return true; +} + +void bpf_token_inc(struct bpf_token *token) +{ + atomic64_inc(&token->refcnt); +} + +static void bpf_token_free(struct bpf_token *token) +{ + security_bpf_token_free(token); + put_user_ns(token->userns); + kfree(token); +} + +static void bpf_token_put_deferred(struct work_struct *work) +{ + struct bpf_token *token = container_of(work, struct bpf_token, work); + + bpf_token_free(token); +} + +void bpf_token_put(struct bpf_token *token) +{ + if (!token) + return; + + if (!atomic64_dec_and_test(&token->refcnt)) + return; + + INIT_WORK(&token->work, bpf_token_put_deferred); + schedule_work(&token->work); +} + +static int bpf_token_release(struct inode *inode, struct file *filp) +{ + struct bpf_token *token = filp->private_data; + + bpf_token_put(token); + return 0; +} + +static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp) +{ + struct bpf_token *token = filp->private_data; + u64 mask; + + BUILD_BUG_ON(__MAX_BPF_CMD >= 64); + mask = BIT_ULL(__MAX_BPF_CMD) - 1; + if ((token->allowed_cmds & mask) == mask) + seq_printf(m, "allowed_cmds:\tany\n"); + else + seq_printf(m, "allowed_cmds:\t0x%llx\n", token->allowed_cmds); + + BUILD_BUG_ON(__MAX_BPF_MAP_TYPE >= 64); + mask = BIT_ULL(__MAX_BPF_MAP_TYPE) - 1; + if ((token->allowed_maps & mask) == mask) + seq_printf(m, "allowed_maps:\tany\n"); + else + seq_printf(m, "allowed_maps:\t0x%llx\n", token->allowed_maps); + + BUILD_BUG_ON(__MAX_BPF_PROG_TYPE >= 64); + mask = BIT_ULL(__MAX_BPF_PROG_TYPE) - 1; + if ((token->allowed_progs & mask) == mask) + seq_printf(m, "allowed_progs:\tany\n"); + else + seq_printf(m, "allowed_progs:\t0x%llx\n", token->allowed_progs); + + BUILD_BUG_ON(__MAX_BPF_ATTACH_TYPE >= 64); + mask = BIT_ULL(__MAX_BPF_ATTACH_TYPE) - 1; + if ((token->allowed_attachs & mask) == mask) + seq_printf(m, "allowed_attachs:\tany\n"); + else + seq_printf(m, "allowed_attachs:\t0x%llx\n", token->allowed_attachs); +} + +#define BPF_TOKEN_INODE_NAME "bpf-token" + +static const struct inode_operations bpf_token_iops = { }; + +static const struct file_operations bpf_token_fops = { + .release = bpf_token_release, + .show_fdinfo = bpf_token_show_fdinfo, +}; + +int bpf_token_create(union bpf_attr *attr) +{ + struct bpf_mount_opts *mnt_opts; + struct bpf_token *token = NULL; + struct user_namespace *userns; + struct inode *inode; + struct file *file; + struct path path; + struct fd f; + umode_t mode; + int err, fd; + + f = fdget(attr->token_create.bpffs_fd); + if (!f.file) + return -EBADF; + + path = f.file->f_path; + path_get(&path); + fdput(f); + + if (path.dentry != path.mnt->mnt_sb->s_root) { + err = -EINVAL; + goto out_path; + } + if (path.mnt->mnt_sb->s_op != &bpf_super_ops) { + err = -EINVAL; + goto out_path; + } + err = path_permission(&path, MAY_ACCESS); + if (err) + goto out_path; + + userns = path.dentry->d_sb->s_user_ns; + /* + * Enforce that creators of BPF tokens are in the same user + * namespace as the BPF FS instance. This makes reasoning about + * permissions a lot easier and we can always relax this later. + */ + if (current_user_ns() != userns) { + err = -EPERM; + goto out_path; + } + if (!ns_capable(userns, CAP_BPF)) { + err = -EPERM; + goto out_path; + } + + /* Creating BPF token in init_user_ns doesn't make much sense. */ + if (current_user_ns() == &init_user_ns) { + err = -EOPNOTSUPP; + goto out_path; + } + + mnt_opts = path.dentry->d_sb->s_fs_info; + if (mnt_opts->delegate_cmds == 0 && + mnt_opts->delegate_maps == 0 && + mnt_opts->delegate_progs == 0 && + mnt_opts->delegate_attachs == 0) { + err = -ENOENT; /* no BPF token delegation is set up */ + goto out_path; + } + + mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); + inode = bpf_get_inode(path.mnt->mnt_sb, NULL, mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_path; + } + + inode->i_op = &bpf_token_iops; + inode->i_fop = &bpf_token_fops; + clear_nlink(inode); /* make sure it is unlinked */ + + file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops); + if (IS_ERR(file)) { + iput(inode); + err = PTR_ERR(file); + goto out_path; + } + + token = kzalloc(sizeof(*token), GFP_USER); + if (!token) { + err = -ENOMEM; + goto out_file; + } + + atomic64_set(&token->refcnt, 1); + + /* remember bpffs owning userns for future ns_capable() checks */ + token->userns = get_user_ns(userns); + + token->allowed_cmds = mnt_opts->delegate_cmds; + token->allowed_maps = mnt_opts->delegate_maps; + token->allowed_progs = mnt_opts->delegate_progs; + token->allowed_attachs = mnt_opts->delegate_attachs; + + err = security_bpf_token_create(token, attr, &path); + if (err) + goto out_token; + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) { + err = fd; + goto out_token; + } + + file->private_data = token; + fd_install(fd, file); + + path_put(&path); + return fd; + +out_token: + bpf_token_free(token); +out_file: + fput(file); +out_path: + path_put(&path); + return err; +} + +struct bpf_token *bpf_token_get_from_fd(u32 ufd) +{ + struct fd f = fdget(ufd); + struct bpf_token *token; + + if (!f.file) + return ERR_PTR(-EBADF); + if (f.file->f_op != &bpf_token_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + token = f.file->private_data; + bpf_token_inc(token); + fdput(f); + + return token; +} + +bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd) +{ + if (!token) + return false; + if (!(token->allowed_cmds & BIT_ULL(cmd))) + return false; + return security_bpf_token_cmd(token, cmd) == 0; +} + +bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type) +{ + if (!token || type >= __MAX_BPF_MAP_TYPE) + return false; + + return token->allowed_maps & BIT_ULL(type); +} + +bool bpf_token_allow_prog_type(const struct bpf_token *token, + enum bpf_prog_type prog_type, + enum bpf_attach_type attach_type) +{ + if (!token || prog_type >= __MAX_BPF_PROG_TYPE || attach_type >= __MAX_BPF_ATTACH_TYPE) + return false; + + return (token->allowed_progs & BIT_ULL(prog_type)) && + (token->allowed_attachs & BIT_ULL(attach_type)); +} diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index d382f5ebe06c..db7599c59c78 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -1014,7 +1014,7 @@ void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr) bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog) { - bool sleepable = prog->aux->sleepable; + bool sleepable = prog->sleepable; if (bpf_prog_check_recur(prog)) return sleepable ? __bpf_prog_enter_sleepable_recur : @@ -1029,7 +1029,7 @@ bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog) bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog) { - bool sleepable = prog->aux->sleepable; + bool sleepable = prog->sleepable; if (bpf_prog_check_recur(prog)) return sleepable ? __bpf_prog_exit_sleepable_recur : diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ddea9567f755..63749ad5ac6b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -528,6 +528,21 @@ static bool is_sync_callback_calling_insn(struct bpf_insn *insn) (bpf_pseudo_kfunc_call(insn) && is_sync_callback_calling_kfunc(insn->imm)); } +static bool is_async_callback_calling_insn(struct bpf_insn *insn) +{ + return bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm); +} + +static bool is_may_goto_insn(struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO; +} + +static bool is_may_goto_insn_at(struct bpf_verifier_env *env, int insn_idx) +{ + return is_may_goto_insn(&env->prog->insnsi[insn_idx]); +} + static bool is_storage_get_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_sk_storage_get || @@ -1155,6 +1170,12 @@ static bool is_spilled_scalar_reg(const struct bpf_stack_state *stack) stack->spilled_ptr.type == SCALAR_VALUE; } +static bool is_spilled_scalar_reg64(const struct bpf_stack_state *stack) +{ + return stack->slot_type[0] == STACK_SPILL && + stack->spilled_ptr.type == SCALAR_VALUE; +} + /* Mark stack slot as STACK_MISC, unless it is already STACK_INVALID, in which * case they are equivalent, or it's STACK_ZERO, in which case we preserve * more precise STACK_ZERO. @@ -1418,6 +1439,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->dfs_depth = src->dfs_depth; dst_state->callback_unroll_depth = src->callback_unroll_depth; dst_state->used_as_loop_entry = src->used_as_loop_entry; + dst_state->may_goto_depth = src->may_goto_depth; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { @@ -2264,8 +2286,7 @@ static void __reg_assign_32_into_64(struct bpf_reg_state *reg) } /* Mark a register as having a completely unknown (scalar) value. */ -static void __mark_reg_unknown(const struct bpf_verifier_env *env, - struct bpf_reg_state *reg) +static void __mark_reg_unknown_imprecise(struct bpf_reg_state *reg) { /* * Clear type, off, and union(map_ptr, range) and @@ -2277,10 +2298,20 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env, reg->ref_obj_id = 0; reg->var_off = tnum_unknown; reg->frameno = 0; - reg->precise = !env->bpf_capable; + reg->precise = false; __mark_reg_unbounded(reg); } +/* Mark a register as having a completely unknown (scalar) value, + * initialize .precise as true when not bpf capable. + */ +static void __mark_reg_unknown(const struct bpf_verifier_env *env, + struct bpf_reg_state *reg) +{ + __mark_reg_unknown_imprecise(reg); + reg->precise = !env->bpf_capable; +} + static void mark_reg_unknown(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno) { @@ -4355,6 +4386,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_MEM: case PTR_TO_FUNC: case PTR_TO_MAP_KEY: + case PTR_TO_ARENA: return true; default: return false; @@ -4380,20 +4412,6 @@ static u64 reg_const_value(struct bpf_reg_state *reg, bool subreg32) return subreg32 ? tnum_subreg(reg->var_off).value : reg->var_off.value; } -static bool __is_scalar_unbounded(struct bpf_reg_state *reg) -{ - return tnum_is_unknown(reg->var_off) && - reg->smin_value == S64_MIN && reg->smax_value == S64_MAX && - reg->umin_value == 0 && reg->umax_value == U64_MAX && - reg->s32_min_value == S32_MIN && reg->s32_max_value == S32_MAX && - reg->u32_min_value == 0 && reg->u32_max_value == U32_MAX; -} - -static bool register_is_bounded(struct bpf_reg_state *reg) -{ - return reg->type == SCALAR_VALUE && !__is_scalar_unbounded(reg); -} - static bool __is_pointer_value(bool allow_ptr_leaks, const struct bpf_reg_state *reg) { @@ -4403,6 +4421,18 @@ static bool __is_pointer_value(bool allow_ptr_leaks, return reg->type != SCALAR_VALUE; } +static void assign_scalar_id_before_mov(struct bpf_verifier_env *env, + struct bpf_reg_state *src_reg) +{ + if (src_reg->type == SCALAR_VALUE && !src_reg->id && + !tnum_is_const(src_reg->var_off)) + /* Ensure that src_reg has a valid ID that will be copied to + * dst_reg and then will be used by find_equal_scalars() to + * propagate min/max range. + */ + src_reg->id = ++env->id_gen; +} + /* Copy src state preserving dst->parent and dst->live fields */ static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src) { @@ -4438,6 +4468,11 @@ static bool is_bpf_st_mem(struct bpf_insn *insn) return BPF_CLASS(insn->code) == BPF_ST && BPF_MODE(insn->code) == BPF_MEM; } +static int get_reg_width(struct bpf_reg_state *reg) +{ + return fls64(reg->umax_value); +} + /* check_stack_{read,write}_fixed_off functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ @@ -4487,13 +4522,19 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, return err; mark_stack_slot_scratched(env, spi); - if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) && env->bpf_capable) { + if (reg && !(off % BPF_REG_SIZE) && reg->type == SCALAR_VALUE && env->bpf_capable) { + bool reg_value_fits; + + reg_value_fits = get_reg_width(reg) <= BITS_PER_BYTE * size; + /* Make sure that reg had an ID to build a relation on spill. */ + if (reg_value_fits) + assign_scalar_id_before_mov(env, reg); save_register_state(env, state, spi, reg, size); /* Break the relation on a narrowing spill. */ - if (fls64(reg->umax_value) > BITS_PER_BYTE * size) + if (!reg_value_fits) state->stack[spi].spilled_ptr.id = 0; } else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) && - insn->imm != 0 && env->bpf_capable) { + env->bpf_capable) { struct bpf_reg_state fake_reg = {}; __mark_reg_known(&fake_reg, insn->imm); @@ -4640,7 +4681,20 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, return -EINVAL; } - /* Erase all spilled pointers. */ + /* If writing_zero and the spi slot contains a spill of value 0, + * maintain the spill type. + */ + if (writing_zero && *stype == STACK_SPILL && + is_spilled_scalar_reg(&state->stack[spi])) { + struct bpf_reg_state *spill_reg = &state->stack[spi].spilled_ptr; + + if (tnum_is_const(spill_reg->var_off) && spill_reg->var_off.value == 0) { + zero_used = true; + continue; + } + } + + /* Erase all other spilled pointers. */ state->stack[spi].spilled_ptr.type = NOT_INIT; /* Update the slot type. */ @@ -4756,7 +4810,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, if (dst_regno < 0) return 0; - if (!(off % BPF_REG_SIZE) && size == spill_size) { + if (size <= spill_size && + bpf_stack_narrow_access_ok(off, size, spill_size)) { /* The earlier check_reg_arg() has decided the * subreg_def for this insn. Save it first. */ @@ -4764,6 +4819,12 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, copy_register_state(&state->regs[dst_regno], reg); state->regs[dst_regno].subreg_def = subreg_def; + + /* Break the relation on a narrowing fill. + * coerce_reg_to_size will adjust the boundaries. + */ + if (get_reg_width(reg) > size * BITS_PER_BYTE) + state->regs[dst_regno].id = 0; } else { int spill_cnt = 0, zero_cnt = 0; @@ -5211,6 +5272,11 @@ bad_type: return -EINVAL; } +static bool in_sleepable(struct bpf_verifier_env *env) +{ + return env->prog->sleepable; +} + /* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock() * can dereference RCU protected pointers and result is PTR_TRUSTED. */ @@ -5218,7 +5284,7 @@ static bool in_rcu_cs(struct bpf_verifier_env *env) { return env->cur_state->active_rcu_lock || env->cur_state->active_lock.ptr || - !env->prog->aux->sleepable; + !in_sleepable(env); } /* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */ @@ -5763,6 +5829,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, case PTR_TO_XDP_SOCK: pointer_desc = "xdp_sock "; break; + case PTR_TO_ARENA: + return 0; default: break; } @@ -5770,6 +5838,17 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, strict); } +static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth) +{ + if (env->prog->jit_requested) + return round_up(stack_depth, 16); + + /* round up to 32-bytes, since this is granularity + * of interpreter stack size + */ + return round_up(max_t(u32, stack_depth, 1), 32); +} + /* starting from main bpf function walk all instructions of the function * and recursively walk all callees that given function can call. * Ignore jump and exit insns. @@ -5813,10 +5892,7 @@ process_func: depth); return -EACCES; } - /* round up to 32-bytes, since this is granularity - * of interpreter stack size - */ - depth += round_up(max_t(u32, subprog[idx].stack_depth, 1), 32); + depth += round_up_stack_depth(env, subprog[idx].stack_depth); if (depth > MAX_BPF_STACK) { verbose(env, "combined stack size of %d calls is %d. Too large\n", frame + 1, depth); @@ -5910,7 +5986,7 @@ continue_func: */ if (frame == 0) return 0; - depth -= round_up(max_t(u32, subprog[idx].stack_depth, 1), 32); + depth -= round_up_stack_depth(env, subprog[idx].stack_depth); frame--; i = ret_insn[frame]; idx = ret_prog[frame]; @@ -6041,10 +6117,10 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) * values are also truncated so we push 64-bit bounds into * 32-bit bounds. Above were truncated < 32-bits already. */ - if (size < 4) { + if (size < 4) __mark_reg32_unbounded(reg); - reg_bounds_sync(reg); - } + + reg_bounds_sync(reg); } static void set_sext64_default_val(struct bpf_reg_state *reg, int size) @@ -6864,6 +6940,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ)) mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_ARENA) { + if (t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str(env, reg->type)); @@ -8200,6 +8279,7 @@ found: switch ((int)reg->type) { case PTR_TO_BTF_ID: case PTR_TO_BTF_ID | PTR_TRUSTED: + case PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL: case PTR_TO_BTF_ID | MEM_RCU: case PTR_TO_BTF_ID | PTR_MAYBE_NULL: case PTR_TO_BTF_ID | PTR_MAYBE_NULL | MEM_RCU: @@ -8334,6 +8414,7 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env, case PTR_TO_MEM | MEM_RINGBUF: case PTR_TO_BUF: case PTR_TO_BUF | MEM_RDONLY: + case PTR_TO_ARENA: case SCALAR_VALUE: return 0; /* All the rest must be rejected, except PTR_TO_BTF_ID which allows @@ -9298,10 +9379,34 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, bpf_log(log, "arg#%d is expected to be non-NULL\n", i); return -EINVAL; } + } else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) { + /* + * Can pass any value and the kernel won't crash, but + * only PTR_TO_ARENA or SCALAR make sense. Everything + * else is a bug in the bpf program. Point it out to + * the user at the verification time instead of + * run-time debug nightmare. + */ + if (reg->type != PTR_TO_ARENA && reg->type != SCALAR_VALUE) { + bpf_log(log, "R%d is not a pointer to arena or scalar.\n", regno); + return -EINVAL; + } } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) { ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0); if (ret) return ret; + } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) { + struct bpf_call_arg_meta meta; + int err; + + if (register_is_null(reg) && type_may_be_null(arg->arg_type)) + continue; + + memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */ + err = check_reg_type(env, regno, arg->arg_type, &arg->btf_id, &meta); + err = err ?: check_func_arg_reg_off(env, reg, regno, arg->arg_type); + if (err) + return err; } else { bpf_log(log, "verifier bug: unrecognized arg#%d type %d\n", i, arg->arg_type); @@ -9377,9 +9482,7 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins return -EFAULT; } - if (insn->code == (BPF_JMP | BPF_CALL) && - insn->src_reg == 0 && - insn->imm == BPF_FUNC_timer_set_callback) { + if (is_async_callback_calling_insn(insn)) { struct bpf_verifier_state *async_cb; /* there is no real recursion here. timer callbacks are async */ @@ -9438,6 +9541,13 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (subprog_is_global(env, subprog)) { const char *sub_name = subprog_name(env, subprog); + /* Only global subprogs cannot be called with a lock held. */ + if (env->cur_state->active_lock.ptr) { + verbose(env, "global function calls are not allowed while holding a lock,\n" + "use static function instead\n"); + return -EINVAL; + } + if (err) { verbose(env, "Caller passes invalid args into func#%d ('%s')\n", subprog, sub_name); @@ -10094,7 +10204,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EINVAL; } - if (!env->prog->aux->sleepable && fn->might_sleep) { + if (!in_sleepable(env) && fn->might_sleep) { verbose(env, "helper call might sleep in a non-sleepable prog\n"); return -EINVAL; } @@ -10124,7 +10234,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EINVAL; } - if (env->prog->aux->sleepable && is_storage_get_function(func_id)) + if (in_sleepable(env) && is_storage_get_function(func_id)) env->insn_aux_data[insn_idx].storage_get_func_atomic = true; } @@ -10620,24 +10730,6 @@ static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta) return meta->kfunc_flags & KF_RCU_PROTECTED; } -static bool __kfunc_param_match_suffix(const struct btf *btf, - const struct btf_param *arg, - const char *suffix) -{ - int suffix_len = strlen(suffix), len; - const char *param_name; - - /* In the future, this can be ported to use BTF tagging */ - param_name = btf_name_by_offset(btf, arg->name_off); - if (str_is_empty(param_name)) - return false; - len = strlen(param_name); - if (len < suffix_len) - return false; - param_name += len - suffix_len; - return !strncmp(param_name, suffix, suffix_len); -} - static bool is_kfunc_arg_mem_size(const struct btf *btf, const struct btf_param *arg, const struct bpf_reg_state *reg) @@ -10648,7 +10740,7 @@ static bool is_kfunc_arg_mem_size(const struct btf *btf, if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE) return false; - return __kfunc_param_match_suffix(btf, arg, "__sz"); + return btf_param_match_suffix(btf, arg, "__sz"); } static bool is_kfunc_arg_const_mem_size(const struct btf *btf, @@ -10661,47 +10753,52 @@ static bool is_kfunc_arg_const_mem_size(const struct btf *btf, if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE) return false; - return __kfunc_param_match_suffix(btf, arg, "__szk"); + return btf_param_match_suffix(btf, arg, "__szk"); } static bool is_kfunc_arg_optional(const struct btf *btf, const struct btf_param *arg) { - return __kfunc_param_match_suffix(btf, arg, "__opt"); + return btf_param_match_suffix(btf, arg, "__opt"); } static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg) { - return __kfunc_param_match_suffix(btf, arg, "__k"); + return btf_param_match_suffix(btf, arg, "__k"); } static bool is_kfunc_arg_ignore(const struct btf *btf, const struct btf_param *arg) { - return __kfunc_param_match_suffix(btf, arg, "__ign"); + return btf_param_match_suffix(btf, arg, "__ign"); +} + +static bool is_kfunc_arg_map(const struct btf *btf, const struct btf_param *arg) +{ + return btf_param_match_suffix(btf, arg, "__map"); } static bool is_kfunc_arg_alloc_obj(const struct btf *btf, const struct btf_param *arg) { - return __kfunc_param_match_suffix(btf, arg, "__alloc"); + return btf_param_match_suffix(btf, arg, "__alloc"); } static bool is_kfunc_arg_uninit(const struct btf *btf, const struct btf_param *arg) { - return __kfunc_param_match_suffix(btf, arg, "__uninit"); + return btf_param_match_suffix(btf, arg, "__uninit"); } static bool is_kfunc_arg_refcounted_kptr(const struct btf *btf, const struct btf_param *arg) { - return __kfunc_param_match_suffix(btf, arg, "__refcounted_kptr"); + return btf_param_match_suffix(btf, arg, "__refcounted_kptr"); } static bool is_kfunc_arg_nullable(const struct btf *btf, const struct btf_param *arg) { - return __kfunc_param_match_suffix(btf, arg, "__nullable"); + return btf_param_match_suffix(btf, arg, "__nullable"); } static bool is_kfunc_arg_const_str(const struct btf *btf, const struct btf_param *arg) { - return __kfunc_param_match_suffix(btf, arg, "__str"); + return btf_param_match_suffix(btf, arg, "__str"); } static bool is_kfunc_arg_scalar_with_name(const struct btf *btf, @@ -10848,6 +10945,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_RB_NODE, KF_ARG_PTR_TO_NULL, KF_ARG_PTR_TO_CONST_STR, + KF_ARG_PTR_TO_MAP, }; enum special_kfunc_type { @@ -10971,7 +11069,7 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, * type to our caller. When a set of conditions hold in the BTF type of * arguments, we resolve it to a known kfunc_ptr_arg_type. */ - if (btf_get_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno)) + if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno)) return KF_ARG_PTR_TO_CTX; if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno])) @@ -11001,6 +11099,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_const_str(meta->btf, &args[argno])) return KF_ARG_PTR_TO_CONST_STR; + if (is_kfunc_arg_map(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_MAP; + if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { if (!btf_type_is_struct(ref_t)) { verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n", @@ -11483,7 +11584,7 @@ static bool check_css_task_iter_allowlist(struct bpf_verifier_env *env) return true; fallthrough; default: - return env->prog->aux->sleepable; + return in_sleepable(env); } } @@ -11601,6 +11702,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ switch (kf_arg_type) { case KF_ARG_PTR_TO_NULL: continue; + case KF_ARG_PTR_TO_MAP: case KF_ARG_PTR_TO_ALLOC_BTF_ID: case KF_ARG_PTR_TO_BTF_ID: if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta)) @@ -11817,6 +11919,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (ret < 0) return ret; break; + case KF_ARG_PTR_TO_MAP: + /* If argument has '__map' suffix expect 'struct bpf_map *' */ + ref_id = *reg2btf_ids[CONST_PTR_TO_MAP]; + ref_t = btf_type_by_id(btf_vmlinux, ref_id); + ref_tname = btf_name_by_offset(btf, ref_t->name_off); + fallthrough; case KF_ARG_PTR_TO_BTF_ID: /* Only base_type is checked, further checks are done here */ if ((base_type(reg->type) != PTR_TO_BTF_ID || @@ -12004,7 +12112,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } sleepable = is_kfunc_sleepable(&meta); - if (sleepable && !env->prog->aux->sleepable) { + if (sleepable && !in_sleepable(env)) { verbose(env, "program must be sleepable to call sleepable kfunc %s\n", func_name); return -EACCES; } @@ -12291,6 +12399,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, meta.func_name); return -EFAULT; } + } else if (btf_type_is_void(ptr_type)) { + /* kfunc returning 'void *' is equivalent to returning scalar */ + mark_reg_unknown(env, regs, BPF_REG_0); } else if (!__btf_type_is_struct(ptr_type)) { if (!meta.r0_size) { __u32 sz; @@ -12828,6 +12939,19 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } switch (base_type(ptr_reg->type)) { + case PTR_TO_CTX: + case PTR_TO_MAP_VALUE: + case PTR_TO_MAP_KEY: + case PTR_TO_STACK: + case PTR_TO_PACKET_META: + case PTR_TO_PACKET: + case PTR_TO_TP_BUFFER: + case PTR_TO_BTF_ID: + case PTR_TO_MEM: + case PTR_TO_BUF: + case PTR_TO_FUNC: + case CONST_PTR_TO_DYNPTR: + break; case PTR_TO_FLOW_KEYS: if (known) break; @@ -12837,16 +12961,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, if (known && smin_val == 0 && opcode == BPF_ADD) break; fallthrough; - case PTR_TO_PACKET_END: - case PTR_TO_SOCKET: - case PTR_TO_SOCK_COMMON: - case PTR_TO_TCP_SOCK: - case PTR_TO_XDP_SOCK: + default: verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str(env, ptr_reg->type)); return -EACCES; - default: - break; } /* In case of 'scalar += pointer', dst_reg inherits pointer type and id. @@ -13753,6 +13871,21 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, dst_reg = ®s[insn->dst_reg]; src_reg = NULL; + + if (dst_reg->type == PTR_TO_ARENA) { + struct bpf_insn_aux_data *aux = cur_aux(env); + + if (BPF_CLASS(insn->code) == BPF_ALU64) + /* + * 32-bit operations zero upper bits automatically. + * 64-bit operations need to be converted to 32. + */ + aux->needs_zext = true; + + /* Any arithmetic operations are allowed on arena pointers */ + return 0; + } + if (dst_reg->type != SCALAR_VALUE) ptr_reg = dst_reg; else @@ -13870,19 +14003,20 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else if (opcode == BPF_MOV) { if (BPF_SRC(insn->code) == BPF_X) { - if (insn->imm != 0) { - verbose(env, "BPF_MOV uses reserved fields\n"); - return -EINVAL; - } - if (BPF_CLASS(insn->code) == BPF_ALU) { - if (insn->off != 0 && insn->off != 8 && insn->off != 16) { + if ((insn->off != 0 && insn->off != 8 && insn->off != 16) || + insn->imm) { verbose(env, "BPF_MOV uses reserved fields\n"); return -EINVAL; } + } else if (insn->off == BPF_ADDR_SPACE_CAST) { + if (insn->imm != 1 && insn->imm != 1u << 16) { + verbose(env, "addr_space_cast insn can only convert between address space 1 and 0\n"); + return -EINVAL; + } } else { - if (insn->off != 0 && insn->off != 8 && insn->off != 16 && - insn->off != 32) { + if ((insn->off != 0 && insn->off != 8 && insn->off != 16 && + insn->off != 32) || insn->imm) { verbose(env, "BPF_MOV uses reserved fields\n"); return -EINVAL; } @@ -13907,20 +14041,18 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (BPF_SRC(insn->code) == BPF_X) { struct bpf_reg_state *src_reg = regs + insn->src_reg; struct bpf_reg_state *dst_reg = regs + insn->dst_reg; - bool need_id = src_reg->type == SCALAR_VALUE && !src_reg->id && - !tnum_is_const(src_reg->var_off); if (BPF_CLASS(insn->code) == BPF_ALU64) { - if (insn->off == 0) { + if (insn->imm) { + /* off == BPF_ADDR_SPACE_CAST */ + mark_reg_unknown(env, regs, insn->dst_reg); + if (insn->imm == 1) /* cast from as(1) to as(0) */ + dst_reg->type = PTR_TO_ARENA; + } else if (insn->off == 0) { /* case: R1 = R2 * copy register state to dest reg */ - if (need_id) - /* Assign src and dst registers the same ID - * that will be used by find_equal_scalars() - * to propagate min/max range. - */ - src_reg->id = ++env->id_gen; + assign_scalar_id_before_mov(env, src_reg); copy_register_state(dst_reg, src_reg); dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = DEF_NOT_SUBREG; @@ -13935,8 +14067,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) bool no_sext; no_sext = src_reg->umax_value < (1ULL << (insn->off - 1)); - if (no_sext && need_id) - src_reg->id = ++env->id_gen; + if (no_sext) + assign_scalar_id_before_mov(env, src_reg); copy_register_state(dst_reg, src_reg); if (!no_sext) dst_reg->id = 0; @@ -13956,10 +14088,10 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EACCES; } else if (src_reg->type == SCALAR_VALUE) { if (insn->off == 0) { - bool is_src_reg_u32 = src_reg->umax_value <= U32_MAX; + bool is_src_reg_u32 = get_reg_width(src_reg) <= 32; - if (is_src_reg_u32 && need_id) - src_reg->id = ++env->id_gen; + if (is_src_reg_u32) + assign_scalar_id_before_mov(env, src_reg); copy_register_state(dst_reg, src_reg); /* Make sure ID is cleared if src_reg is not in u32 * range otherwise dst_reg min/max could be incorrectly @@ -13973,8 +14105,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) /* case: W1 = (s8, s16)W2 */ bool no_sext = src_reg->umax_value < (1ULL << (insn->off - 1)); - if (no_sext && need_id) - src_reg->id = ++env->id_gen; + if (no_sext) + assign_scalar_id_before_mov(env, src_reg); copy_register_state(dst_reg, src_reg); if (!no_sext) dst_reg->id = 0; @@ -14809,11 +14941,36 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, int err; /* Only conditional jumps are expected to reach here. */ - if (opcode == BPF_JA || opcode > BPF_JSLE) { + if (opcode == BPF_JA || opcode > BPF_JCOND) { verbose(env, "invalid BPF_JMP/JMP32 opcode %x\n", opcode); return -EINVAL; } + if (opcode == BPF_JCOND) { + struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st; + int idx = *insn_idx; + + if (insn->code != (BPF_JMP | BPF_JCOND) || + insn->src_reg != BPF_MAY_GOTO || + insn->dst_reg || insn->imm || insn->off == 0) { + verbose(env, "invalid may_goto off %d imm %d\n", + insn->off, insn->imm); + return -EINVAL; + } + prev_st = find_prev_entry(env, cur_st->parent, idx); + + /* branch out 'fallthrough' insn as a new state to explore */ + queued_st = push_stack(env, idx + 1, idx, false); + if (!queued_st) + return -ENOMEM; + + queued_st->may_goto_depth++; + if (prev_st) + widen_imprecise_scalars(env, prev_st, queued_st); + *insn_idx += insn->off; + return 0; + } + /* check src2 operand */ err = check_reg_arg(env, insn->dst_reg, SRC_OP); if (err) @@ -15065,6 +15222,10 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) if (insn->src_reg == BPF_PSEUDO_MAP_VALUE || insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) { + if (map->map_type == BPF_MAP_TYPE_ARENA) { + __mark_reg_unknown(env, dst_reg); + return 0; + } dst_reg->type = PTR_TO_MAP_VALUE; dst_reg->off = aux->map_off; WARN_ON_ONCE(map->max_entries != 1); @@ -15531,7 +15692,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env) return DONE_EXPLORING; case BPF_CALL: - if (insn->src_reg == 0 && insn->imm == BPF_FUNC_timer_set_callback) + if (is_async_callback_calling_insn(insn)) /* Mark this call insn as a prune point to trigger * is_state_visited() check before call itself is * processed by __check_func_call(). Otherwise new @@ -15597,6 +15758,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env) default: /* conditional jump with two edges */ mark_prune_point(env, t); + if (is_may_goto_insn(insn)) + mark_force_checkpoint(env, t); ret = push_insn(t, t + 1, FALLTHROUGH, env); if (ret) @@ -16160,8 +16323,8 @@ static int check_btf_info(struct bpf_verifier_env *env, } /* check %cur's range satisfies %old's */ -static bool range_within(struct bpf_reg_state *old, - struct bpf_reg_state *cur) +static bool range_within(const struct bpf_reg_state *old, + const struct bpf_reg_state *cur) { return old->umin_value <= cur->umin_value && old->umax_value >= cur->umax_value && @@ -16325,21 +16488,28 @@ static bool regs_exact(const struct bpf_reg_state *rold, check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap); } +enum exact_level { + NOT_EXACT, + EXACT, + RANGE_WITHIN +}; + /* Returns true if (rold safe implies rcur safe) */ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, - struct bpf_reg_state *rcur, struct bpf_idmap *idmap, bool exact) + struct bpf_reg_state *rcur, struct bpf_idmap *idmap, + enum exact_level exact) { - if (exact) + if (exact == EXACT) return regs_exact(rold, rcur, idmap); - if (!(rold->live & REG_LIVE_READ)) + if (!(rold->live & REG_LIVE_READ) && exact == NOT_EXACT) /* explored state didn't use this */ return true; - if (rold->type == NOT_INIT) - /* explored state can't have used this */ - return true; - if (rcur->type == NOT_INIT) - return false; + if (rold->type == NOT_INIT) { + if (exact == NOT_EXACT || rcur->type == NOT_INIT) + /* explored state can't have used this */ + return true; + } /* Enforce that register types have to match exactly, including their * modifiers (like PTR_MAYBE_NULL, MEM_RDONLY, etc), as a general @@ -16374,7 +16544,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && check_scalar_ids(rold->id, rcur->id, idmap); } - if (!rold->precise) + if (!rold->precise && exact == NOT_EXACT) return true; /* Why check_ids() for scalar registers? * @@ -16442,13 +16612,53 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, * the same stack frame, since fp-8 in foo != fp-8 in bar */ return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno; + case PTR_TO_ARENA: + return true; default: return regs_exact(rold, rcur, idmap); } } +static struct bpf_reg_state unbound_reg; + +static __init int unbound_reg_init(void) +{ + __mark_reg_unknown_imprecise(&unbound_reg); + unbound_reg.live |= REG_LIVE_READ; + return 0; +} +late_initcall(unbound_reg_init); + +static bool is_stack_all_misc(struct bpf_verifier_env *env, + struct bpf_stack_state *stack) +{ + u32 i; + + for (i = 0; i < ARRAY_SIZE(stack->slot_type); ++i) { + if ((stack->slot_type[i] == STACK_MISC) || + (stack->slot_type[i] == STACK_INVALID && env->allow_uninit_stack)) + continue; + return false; + } + + return true; +} + +static struct bpf_reg_state *scalar_reg_for_stack(struct bpf_verifier_env *env, + struct bpf_stack_state *stack) +{ + if (is_spilled_scalar_reg64(stack)) + return &stack->spilled_ptr; + + if (is_stack_all_misc(env, stack)) + return &unbound_reg; + + return NULL; +} + static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, - struct bpf_func_state *cur, struct bpf_idmap *idmap, bool exact) + struct bpf_func_state *cur, struct bpf_idmap *idmap, + enum exact_level exact) { int i, spi; @@ -16461,12 +16671,13 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, spi = i / BPF_REG_SIZE; - if (exact && + if (exact != NOT_EXACT && old->stack[spi].slot_type[i % BPF_REG_SIZE] != cur->stack[spi].slot_type[i % BPF_REG_SIZE]) return false; - if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ) && !exact) { + if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ) + && exact == NOT_EXACT) { i += BPF_REG_SIZE - 1; /* explored state didn't use this */ continue; @@ -16485,6 +16696,20 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, if (i >= cur->allocated_stack) return false; + /* 64-bit scalar spill vs all slots MISC and vice versa. + * Load from all slots MISC produces unbound scalar. + * Construct a fake register for such stack and call + * regsafe() to ensure scalar ids are compared. + */ + old_reg = scalar_reg_for_stack(env, &old->stack[spi]); + cur_reg = scalar_reg_for_stack(env, &cur->stack[spi]); + if (old_reg && cur_reg) { + if (!regsafe(env, old_reg, cur_reg, idmap, exact)) + return false; + i += BPF_REG_SIZE - 1; + continue; + } + /* if old state was safe with misc data in the stack * it will be safe with zero-initialized stack. * The opposite is not true @@ -16598,7 +16823,7 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur, * the current state will reach 'bpf_exit' instruction safely */ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old, - struct bpf_func_state *cur, bool exact) + struct bpf_func_state *cur, enum exact_level exact) { int i; @@ -16628,7 +16853,7 @@ static void reset_idmap_scratch(struct bpf_verifier_env *env) static bool states_equal(struct bpf_verifier_env *env, struct bpf_verifier_state *old, struct bpf_verifier_state *cur, - bool exact) + enum exact_level exact) { int i; @@ -17002,7 +17227,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * => unsafe memory access at 11 would not be caught. */ if (is_iter_next_insn(env, insn_idx)) { - if (states_equal(env, &sl->state, cur, true)) { + if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) { struct bpf_func_state *cur_frame; struct bpf_reg_state *iter_state, *iter_reg; int spi; @@ -17025,15 +17250,23 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) } goto skip_inf_loop_check; } + if (is_may_goto_insn_at(env, insn_idx)) { + if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) { + update_loop_entry(cur, &sl->state); + goto hit; + } + goto skip_inf_loop_check; + } if (calls_callback(env, insn_idx)) { - if (states_equal(env, &sl->state, cur, true)) + if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) goto hit; goto skip_inf_loop_check; } /* attempt to detect infinite loop to avoid unnecessary doomed work */ if (states_maybe_looping(&sl->state, cur) && - states_equal(env, &sl->state, cur, false) && + states_equal(env, &sl->state, cur, EXACT) && !iter_active_depths_differ(&sl->state, cur) && + sl->state.may_goto_depth == cur->may_goto_depth && sl->state.callback_unroll_depth == cur->callback_unroll_depth) { verbose_linfo(env, insn_idx, "; "); verbose(env, "infinite loop detected at insn %d\n", insn_idx); @@ -17089,7 +17322,7 @@ skip_inf_loop_check: */ loop_entry = get_loop_entry(&sl->state); force_exact = loop_entry && loop_entry->branches > 0; - if (states_equal(env, &sl->state, cur, force_exact)) { + if (states_equal(env, &sl->state, cur, force_exact ? RANGE_WITHIN : NOT_EXACT)) { if (force_exact) update_loop_entry(cur, loop_entry); hit: @@ -17259,6 +17492,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type) case PTR_TO_TCP_SOCK: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: + case PTR_TO_ARENA: return false; default: return true; @@ -17541,7 +17775,6 @@ static int do_check(struct bpf_verifier_env *env) if (env->cur_state->active_lock.ptr) { if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) || - (insn->src_reg == BPF_PSEUDO_CALL) || (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && (insn->off != 0 || !is_bpf_graph_api_kfunc(insn->imm)))) { verbose(env, "function calls are not allowed while holding a lock\n"); @@ -17589,14 +17822,12 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } process_bpf_exit_full: - if (env->cur_state->active_lock.ptr && - !in_rbtree_lock_required_cb(env)) { + if (env->cur_state->active_lock.ptr && !env->cur_state->curframe) { verbose(env, "bpf_spin_unlock is missing\n"); return -EINVAL; } - if (env->cur_state->active_rcu_lock && - !in_rbtree_lock_required_cb(env)) { + if (env->cur_state->active_rcu_lock && !env->cur_state->curframe) { verbose(env, "bpf_rcu_read_unlock is missing\n"); return -EINVAL; } @@ -17909,7 +18140,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, return -EINVAL; } - if (prog->aux->sleepable) + if (prog->sleepable) switch (map->map_type) { case BPF_MAP_TYPE_HASH: case BPF_MAP_TYPE_LRU_HASH: @@ -17925,6 +18156,9 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_SK_STORAGE: case BPF_MAP_TYPE_TASK_STORAGE: case BPF_MAP_TYPE_CGRP_STORAGE: + case BPF_MAP_TYPE_QUEUE: + case BPF_MAP_TYPE_STACK: + case BPF_MAP_TYPE_ARENA: break; default: verbose(env, @@ -18094,7 +18328,7 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) return -E2BIG; } - if (env->prog->aux->sleepable) + if (env->prog->sleepable) atomic64_inc(&map->sleepable_refcnt); /* hold the map. If the program is rejected by verifier, * the map will be released by release_maps() or it @@ -18112,6 +18346,31 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) fdput(f); return -EBUSY; } + if (map->map_type == BPF_MAP_TYPE_ARENA) { + if (env->prog->aux->arena) { + verbose(env, "Only one arena per program\n"); + fdput(f); + return -EBUSY; + } + if (!env->allow_ptr_leaks || !env->bpf_capable) { + verbose(env, "CAP_BPF and CAP_PERFMON are required to use arena\n"); + fdput(f); + return -EPERM; + } + if (!env->prog->jit_requested) { + verbose(env, "JIT is required to use arena\n"); + return -EOPNOTSUPP; + } + if (!bpf_jit_supports_arena()) { + verbose(env, "JIT doesn't support arena\n"); + return -EOPNOTSUPP; + } + env->prog->aux->arena = (void *)map; + if (!bpf_arena_get_user_vm_start(env->prog->aux->arena)) { + verbose(env, "arena's user address must be set via map_extra or mmap()\n"); + return -EINVAL; + } + } fdput(f); next_insn: @@ -18733,6 +18992,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) env->prog->aux->num_exentries++; } continue; + case PTR_TO_ARENA: + if (BPF_MODE(insn->code) == BPF_MEMSX) { + verbose(env, "sign extending loads from arena are not supported yet\n"); + return -EOPNOTSUPP; + } + insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code); + env->prog->aux->num_exentries++; + continue; default: continue; } @@ -18918,13 +19185,19 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->nr_linfo = prog->aux->nr_linfo; func[i]->aux->jited_linfo = prog->aux->jited_linfo; func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx; + func[i]->aux->arena = prog->aux->arena; num_exentries = 0; insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && (BPF_MODE(insn->code) == BPF_PROBE_MEM || + BPF_MODE(insn->code) == BPF_PROBE_MEM32 || BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) num_exentries++; + if ((BPF_CLASS(insn->code) == BPF_STX || + BPF_CLASS(insn->code) == BPF_ST) && + BPF_MODE(insn->code) == BPF_PROBE_MEM32) + num_exentries++; } func[i]->aux->num_exentries = num_exentries; func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; @@ -19299,7 +19572,10 @@ static int do_misc_fixups(struct bpf_verifier_env *env) struct bpf_insn insn_buf[16]; struct bpf_prog *new_prog; struct bpf_map *map_ptr; - int i, ret, cnt, delta = 0; + int i, ret, cnt, delta = 0, cur_subprog = 0; + struct bpf_subprog_info *subprogs = env->subprog_info; + u16 stack_depth = subprogs[cur_subprog].stack_depth; + u16 stack_depth_extra = 0; if (env->seen_exception && !env->exception_callback_subprog) { struct bpf_insn patch[] = { @@ -19319,7 +19595,22 @@ static int do_misc_fixups(struct bpf_verifier_env *env) mark_subprog_exc_cb(env, env->exception_callback_subprog); } - for (i = 0; i < insn_cnt; i++, insn++) { + for (i = 0; i < insn_cnt;) { + if (insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->imm) { + if ((insn->off == BPF_ADDR_SPACE_CAST && insn->imm == 1) || + (((struct bpf_map *)env->prog->aux->arena)->map_flags & BPF_F_NO_USER_CONV)) { + /* convert to 32-bit mov that clears upper 32-bit */ + insn->code = BPF_ALU | BPF_MOV | BPF_X; + /* clear off, so it's a normal 'wX = wY' from JIT pov */ + insn->off = 0; + } /* cast from as(0) to as(1) should be handled by JIT */ + goto next_insn; + } + + if (env->insn_aux_data[i + delta].needs_zext) + /* Convert BPF_CLASS(insn->code) == BPF_ALU64 to 32-bit ALU */ + insn->code = BPF_ALU | BPF_OP(insn->code) | BPF_SRC(insn->code); + /* Make divide-by-zero exceptions impossible. */ if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) || insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || @@ -19358,7 +19649,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; - continue; + goto next_insn; } /* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */ @@ -19378,7 +19669,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; - continue; + goto next_insn; } /* Rewrite pointer arithmetic to mitigate speculation attacks. */ @@ -19393,7 +19684,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) aux = &env->insn_aux_data[i + delta]; if (!aux->alu_state || aux->alu_state == BPF_ALU_NON_POINTER) - continue; + goto next_insn; isneg = aux->alu_state & BPF_ALU_NEG_VALUE; issrc = (aux->alu_state & BPF_ALU_SANITIZE) == @@ -19431,19 +19722,39 @@ static int do_misc_fixups(struct bpf_verifier_env *env) delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; - continue; + goto next_insn; + } + + if (is_may_goto_insn(insn)) { + int stack_off = -stack_depth - 8; + + stack_depth_extra = 8; + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off); + insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2); + insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1); + insn_buf[3] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off); + cnt = 4; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; } if (insn->code != (BPF_JMP | BPF_CALL)) - continue; + goto next_insn; if (insn->src_reg == BPF_PSEUDO_CALL) - continue; + goto next_insn; if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt); if (ret) return ret; if (cnt == 0) - continue; + goto next_insn; new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) @@ -19452,7 +19763,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; - continue; + goto next_insn; } if (insn->imm == BPF_FUNC_get_route_realm) @@ -19500,11 +19811,11 @@ static int do_misc_fixups(struct bpf_verifier_env *env) } insn->imm = ret + 1; - continue; + goto next_insn; } if (!bpf_map_ptr_unpriv(aux)) - continue; + goto next_insn; /* instead of changing every JIT dealing with tail_call * emit two extra insns: @@ -19533,7 +19844,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; - continue; + goto next_insn; } if (insn->imm == BPF_FUNC_timer_set_callback) { @@ -19570,7 +19881,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) } if (is_storage_get_function(insn->imm)) { - if (!env->prog->aux->sleepable || + if (!in_sleepable(env) || env->insn_aux_data[i + delta].storage_get_func_atomic) insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC); else @@ -19645,7 +19956,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; - continue; + goto next_insn; } BUILD_BUG_ON(!__same_type(ops->map_lookup_elem, @@ -19676,31 +19987,31 @@ patch_map_ops_generic: switch (insn->imm) { case BPF_FUNC_map_lookup_elem: insn->imm = BPF_CALL_IMM(ops->map_lookup_elem); - continue; + goto next_insn; case BPF_FUNC_map_update_elem: insn->imm = BPF_CALL_IMM(ops->map_update_elem); - continue; + goto next_insn; case BPF_FUNC_map_delete_elem: insn->imm = BPF_CALL_IMM(ops->map_delete_elem); - continue; + goto next_insn; case BPF_FUNC_map_push_elem: insn->imm = BPF_CALL_IMM(ops->map_push_elem); - continue; + goto next_insn; case BPF_FUNC_map_pop_elem: insn->imm = BPF_CALL_IMM(ops->map_pop_elem); - continue; + goto next_insn; case BPF_FUNC_map_peek_elem: insn->imm = BPF_CALL_IMM(ops->map_peek_elem); - continue; + goto next_insn; case BPF_FUNC_redirect_map: insn->imm = BPF_CALL_IMM(ops->map_redirect); - continue; + goto next_insn; case BPF_FUNC_for_each_map_elem: insn->imm = BPF_CALL_IMM(ops->map_for_each_callback); - continue; + goto next_insn; case BPF_FUNC_map_lookup_percpu_elem: insn->imm = BPF_CALL_IMM(ops->map_lookup_percpu_elem); - continue; + goto next_insn; } goto patch_call_imm; @@ -19728,7 +20039,7 @@ patch_map_ops_generic: delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; - continue; + goto next_insn; } /* Implement bpf_get_func_arg inline. */ @@ -19753,7 +20064,7 @@ patch_map_ops_generic: delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; - continue; + goto next_insn; } /* Implement bpf_get_func_ret inline. */ @@ -19781,7 +20092,7 @@ patch_map_ops_generic: delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; - continue; + goto next_insn; } /* Implement get_func_arg_cnt inline. */ @@ -19796,7 +20107,7 @@ patch_map_ops_generic: env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; - continue; + goto next_insn; } /* Implement bpf_get_func_ip inline. */ @@ -19811,9 +20122,26 @@ patch_map_ops_generic: env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; - continue; + goto next_insn; } + /* Implement bpf_kptr_xchg inline */ + if (prog->jit_requested && BITS_PER_LONG == 64 && + insn->imm == BPF_FUNC_kptr_xchg && + bpf_jit_supports_ptr_xchg()) { + insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_2); + insn_buf[1] = BPF_ATOMIC_OP(BPF_DW, BPF_XCHG, BPF_REG_1, BPF_REG_0, 0); + cnt = 2; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } patch_call_imm: fn = env->ops->get_func_proto(insn->imm, env->prog); /* all functions that have prototype and verifier allowed @@ -19826,6 +20154,40 @@ patch_call_imm: return -EFAULT; } insn->imm = fn->func - __bpf_call_base; +next_insn: + if (subprogs[cur_subprog + 1].start == i + delta + 1) { + subprogs[cur_subprog].stack_depth += stack_depth_extra; + subprogs[cur_subprog].stack_extra = stack_depth_extra; + cur_subprog++; + stack_depth = subprogs[cur_subprog].stack_depth; + stack_depth_extra = 0; + } + i++; + insn++; + } + + env->prog->aux->stack_depth = subprogs[0].stack_depth; + for (i = 0; i < env->subprog_cnt; i++) { + int subprog_start = subprogs[i].start; + int stack_slots = subprogs[i].stack_extra / 8; + + if (!stack_slots) + continue; + if (stack_slots > 1) { + verbose(env, "verifier bug: stack_slots supports may_goto only\n"); + return -EFAULT; + } + + /* Add ST insn to subprog prologue to init extra stack */ + insn_buf[0] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, + -subprogs[i].stack_depth, BPF_MAX_LOOPS); + /* Copy first actual insn to preserve it */ + insn_buf[1] = env->prog->insnsi[subprog_start]; + + new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, 2); + if (!new_prog) + return -ENOMEM; + env->prog = prog = new_prog; } /* Since poke tab is now finalized, publish aux to tracker. */ @@ -20046,7 +20408,6 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) state->first_insn_idx = env->subprog_info[subprog].start; state->last_insn_idx = -1; - regs = state->frame[state->curframe]->regs; if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) { const char *sub_name = subprog_name(env, subprog); @@ -20090,6 +20451,21 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) mark_reg_known_zero(env, regs, i); reg->mem_size = arg->mem_size; reg->id = ++env->id_gen; + } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) { + reg->type = PTR_TO_BTF_ID; + if (arg->arg_type & PTR_MAYBE_NULL) + reg->type |= PTR_MAYBE_NULL; + if (arg->arg_type & PTR_UNTRUSTED) + reg->type |= PTR_UNTRUSTED; + if (arg->arg_type & PTR_TRUSTED) + reg->type |= PTR_TRUSTED; + mark_reg_known_zero(env, regs, i); + reg->btf = bpf_get_btf_vmlinux(); /* can't fail at this point */ + reg->btf_id = arg->btf_id; + reg->id = ++env->id_gen; + } else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) { + /* caller can pass either PTR_TO_ARENA or SCALAR */ + mark_reg_unknown(env, regs, i); } else { WARN_ONCE(1, "BUG: unhandled arg#%d type %d\n", i - BPF_REG_1, arg->arg_type); @@ -20238,10 +20614,12 @@ static void print_verification_stats(struct bpf_verifier_env *env) static int check_struct_ops_btf_id(struct bpf_verifier_env *env) { const struct btf_type *t, *func_proto; + const struct bpf_struct_ops_desc *st_ops_desc; const struct bpf_struct_ops *st_ops; const struct btf_member *member; struct bpf_prog *prog = env->prog; u32 btf_id, member_idx; + struct btf *btf; const char *mname; if (!prog->gpl_compatible) { @@ -20249,15 +20627,30 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) return -EINVAL; } + if (!prog->aux->attach_btf_id) + return -ENOTSUPP; + + btf = prog->aux->attach_btf; + if (btf_is_module(btf)) { + /* Make sure st_ops is valid through the lifetime of env */ + env->attach_btf_mod = btf_try_get_module(btf); + if (!env->attach_btf_mod) { + verbose(env, "struct_ops module %s is not found\n", + btf_get_name(btf)); + return -ENOTSUPP; + } + } + btf_id = prog->aux->attach_btf_id; - st_ops = bpf_struct_ops_find(btf_id); - if (!st_ops) { + st_ops_desc = bpf_struct_ops_find(btf, btf_id); + if (!st_ops_desc) { verbose(env, "attach_btf_id %u is not a supported struct\n", btf_id); return -ENOTSUPP; } + st_ops = st_ops_desc->st_ops; - t = st_ops->type; + t = st_ops_desc->type; member_idx = prog->expected_attach_type; if (member_idx >= btf_type_vlen(t)) { verbose(env, "attach to invalid member idx %u of struct %s\n", @@ -20266,8 +20659,8 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) } member = &btf_type_member(t)[member_idx]; - mname = btf_name_by_offset(btf_vmlinux, member->name_off); - func_proto = btf_type_resolve_func_ptr(btf_vmlinux, member->type, + mname = btf_name_by_offset(btf, member->name_off); + func_proto = btf_type_resolve_func_ptr(btf, member->type, NULL); if (!func_proto) { verbose(env, "attach to invalid member %s(@idx %u) of struct %s\n", @@ -20285,6 +20678,12 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) } } + /* btf_ctx_access() used this to provide argument type info */ + prog->aux->ctx_arg_info = + st_ops_desc->arg_info[member_idx].info; + prog->aux->ctx_arg_info_size = + st_ops_desc->arg_info[member_idx].cnt; + prog->aux->attach_func_proto = func_proto; prog->aux->attach_func_name = mname; env->ops = st_ops->verifier_ops; @@ -20542,7 +20941,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, } } - if (prog->aux->sleepable) { + if (prog->sleepable) { ret = -EINVAL; switch (prog->type) { case BPF_PROG_TYPE_TRACING: @@ -20653,14 +21052,14 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) u64 key; if (prog->type == BPF_PROG_TYPE_SYSCALL) { - if (prog->aux->sleepable) + if (prog->sleepable) /* attach_btf_id checked to be zero already */ return 0; verbose(env, "Syscall programs can only be sleepable\n"); return -EINVAL; } - if (prog->aux->sleepable && !can_be_sleepable(prog)) { + if (prog->sleepable && !can_be_sleepable(prog)) { verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n"); return -EINVAL; } @@ -20769,7 +21168,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel); - is_priv = bpf_capable(); + + env->allow_ptr_leaks = bpf_allow_ptr_leaks(env->prog->aux->token); + env->allow_uninit_stack = bpf_allow_uninit_stack(env->prog->aux->token); + env->bypass_spec_v1 = bpf_bypass_spec_v1(env->prog->aux->token); + env->bypass_spec_v4 = bpf_bypass_spec_v4(env->prog->aux->token); + env->bpf_capable = is_priv = bpf_token_capable(env->prog->aux->token, CAP_BPF); bpf_get_btf_vmlinux(); @@ -20801,12 +21205,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (attr->prog_flags & BPF_F_ANY_ALIGNMENT) env->strict_alignment = false; - env->allow_ptr_leaks = bpf_allow_ptr_leaks(); - env->allow_uninit_stack = bpf_allow_uninit_stack(); - env->bypass_spec_v1 = bpf_bypass_spec_v1(); - env->bypass_spec_v4 = bpf_bypass_spec_v4(); - env->bpf_capable = bpf_capable(); - if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS; @@ -20972,6 +21370,8 @@ err_release_maps: env->prog->expected_attach_type = 0; *prog = env->prog; + + module_put(env->attach_btf_mod); err_unlock: if (!is_priv) mutex_unlock(&bpf_verifier_lock); diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index a8350d2d63e6..07e2284bb499 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -562,10 +562,10 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) } /* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */ -BTF_SET8_START(bpf_rstat_kfunc_ids) +BTF_KFUNCS_START(bpf_rstat_kfunc_ids) BTF_ID_FLAGS(func, cgroup_rstat_updated) BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE) -BTF_SET8_END(bpf_rstat_kfunc_ids) +BTF_KFUNCS_END(bpf_rstat_kfunc_ids) static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = { .owner = THIS_MODULE, diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config index 4722b998a324..509ee703de15 100644 --- a/kernel/configs/debug.config +++ b/kernel/configs/debug.config @@ -40,6 +40,12 @@ CONFIG_UBSAN_ENUM=y CONFIG_UBSAN_SHIFT=y CONFIG_UBSAN_UNREACHABLE=y # +# Networking Debugging +# +CONFIG_NET_DEV_REFCNT_TRACKER=y +CONFIG_NET_NS_REFCNT_TRACKER=y +CONFIG_DEBUG_NET=y +# # Memory Debugging # # CONFIG_DEBUG_PAGEALLOC is not set diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config index 95a400f042b1..7a5bbfc024b7 100644 --- a/kernel/configs/hardening.config +++ b/kernel/configs/hardening.config @@ -44,7 +44,9 @@ CONFIG_UBSAN_BOUNDS=y # CONFIG_UBSAN_BOOL # CONFIG_UBSAN_ENUM # CONFIG_UBSAN_ALIGNMENT -CONFIG_UBSAN_SANITIZE_ALL=y + +# Sampling-based heap out-of-bounds and use-after-free detection. +CONFIG_KFENCE=y # Linked list integrity checking. CONFIG_LIST_HARDENED=y @@ -93,6 +95,3 @@ CONFIG_SYN_COOKIES=y # Attack surface reduction: Use the modern PTY interface (devpts) only. # CONFIG_LEGACY_PTYS is not set - -# Attack surface reduction: Use only modesetting video drivers. -# CONFIG_DRM_LEGACY is not set diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 75cd6a736d03..78b5dc7cee3a 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -11,9 +11,14 @@ #include <linux/sizes.h> #include <linux/kexec.h> #include <linux/memory.h> +#include <linux/mm.h> #include <linux/cpuhotplug.h> #include <linux/memblock.h> #include <linux/kmemleak.h> +#include <linux/crash_core.h> +#include <linux/reboot.h> +#include <linux/btf.h> +#include <linux/objtool.h> #include <asm/page.h> #include <asm/sections.h> @@ -26,451 +31,130 @@ /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; -/* vmcoreinfo stuff */ -unsigned char *vmcoreinfo_data; -size_t vmcoreinfo_size; -u32 *vmcoreinfo_note; - -/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ -static unsigned char *vmcoreinfo_data_safecopy; - -/* Location of the reserved area for the crash kernel */ -struct resource crashk_res = { - .name = "Crash kernel", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, - .desc = IORES_DESC_CRASH_KERNEL -}; -struct resource crashk_low_res = { - .name = "Crash kernel", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, - .desc = IORES_DESC_CRASH_KERNEL -}; - -/* - * parsing the "crashkernel" commandline - * - * this code is intended to be called from architecture specific code - */ +#ifdef CONFIG_CRASH_DUMP - -/* - * This function parses command lines in the format - * - * crashkernel=ramsize-range:size[,...][@offset] - * - * The function returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_mem(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) +int kimage_crash_copy_vmcoreinfo(struct kimage *image) { - char *cur = cmdline, *tmp; - unsigned long long total_mem = system_ram; + struct page *vmcoreinfo_page; + void *safecopy; + + if (!IS_ENABLED(CONFIG_CRASH_DUMP)) + return 0; + if (image->type != KEXEC_TYPE_CRASH) + return 0; /* - * Firmware sometimes reserves some memory regions for its own use, - * so the system memory size is less than the actual physical memory - * size. Work around this by rounding up the total size to 128M, - * which is enough for most test cases. + * For kdump, allocate one vmcoreinfo safe copy from the + * crash memory. as we have arch_kexec_protect_crashkres() + * after kexec syscall, we naturally protect it from write + * (even read) access under kernel direct mapping. But on + * the other hand, we still need to operate it when crash + * happens to generate vmcoreinfo note, hereby we rely on + * vmap for this purpose. */ - total_mem = roundup(total_mem, SZ_128M); - - /* for each entry of the comma-separated list */ - do { - unsigned long long start, end = ULLONG_MAX, size; - - /* get the start of the range */ - start = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (*cur != '-') { - pr_warn("crashkernel: '-' expected\n"); - return -EINVAL; - } - cur++; - - /* if no ':' is here, than we read the end */ - if (*cur != ':') { - end = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (end <= start) { - pr_warn("crashkernel: end <= start\n"); - return -EINVAL; - } - } - - if (*cur != ':') { - pr_warn("crashkernel: ':' expected\n"); - return -EINVAL; - } - cur++; - - size = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (size >= total_mem) { - pr_warn("crashkernel: invalid size\n"); - return -EINVAL; - } - - /* match ? */ - if (total_mem >= start && total_mem < end) { - *crash_size = size; - break; - } - } while (*cur++ == ','); - - if (*crash_size > 0) { - while (*cur && *cur != ' ' && *cur != '@') - cur++; - if (*cur == '@') { - cur++; - *crash_base = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected after '@'\n"); - return -EINVAL; - } - } - } else - pr_info("crashkernel size resulted in zero bytes\n"); - - return 0; -} - -/* - * That function parses "simple" (old) crashkernel command lines like - * - * crashkernel=size[@offset] - * - * It returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_simple(char *cmdline, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; - } - - if (*cur == '@') - *crash_base = memparse(cur+1, &cur); - else if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; + vmcoreinfo_page = kimage_alloc_control_pages(image, 0); + if (!vmcoreinfo_page) { + pr_warn("Could not allocate vmcoreinfo buffer\n"); + return -ENOMEM; } - - return 0; -} - -#define SUFFIX_HIGH 0 -#define SUFFIX_LOW 1 -#define SUFFIX_NULL 2 -static __initdata char *suffix_tbl[] = { - [SUFFIX_HIGH] = ",high", - [SUFFIX_LOW] = ",low", - [SUFFIX_NULL] = NULL, -}; - -/* - * That function parses "suffix" crashkernel command lines like - * - * crashkernel=size,[high|low] - * - * It returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_suffix(char *cmdline, - unsigned long long *crash_size, - const char *suffix) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; + safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL); + if (!safecopy) { + pr_warn("Could not vmap vmcoreinfo buffer\n"); + return -ENOMEM; } - /* check with suffix */ - if (strncmp(cur, suffix, strlen(suffix))) { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - cur += strlen(suffix); - if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } + image->vmcoreinfo_data_copy = safecopy; + crash_update_vmcoreinfo_safecopy(safecopy); return 0; } -static __init char *get_last_crashkernel(char *cmdline, - const char *name, - const char *suffix) -{ - char *p = cmdline, *ck_cmdline = NULL; - - /* find crashkernel and use the last one if there are more */ - p = strstr(p, name); - while (p) { - char *end_p = strchr(p, ' '); - char *q; - - if (!end_p) - end_p = p + strlen(p); - - if (!suffix) { - int i; - - /* skip the one with any known suffix */ - for (i = 0; suffix_tbl[i]; i++) { - q = end_p - strlen(suffix_tbl[i]); - if (!strncmp(q, suffix_tbl[i], - strlen(suffix_tbl[i]))) - goto next; - } - ck_cmdline = p; - } else { - q = end_p - strlen(suffix); - if (!strncmp(q, suffix, strlen(suffix))) - ck_cmdline = p; - } -next: - p = strstr(p+1, name); - } - return ck_cmdline; -} -static int __init __parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base, - const char *suffix) +int kexec_should_crash(struct task_struct *p) { - char *first_colon, *first_space; - char *ck_cmdline; - char *name = "crashkernel="; - - BUG_ON(!crash_size || !crash_base); - *crash_size = 0; - *crash_base = 0; - - ck_cmdline = get_last_crashkernel(cmdline, name, suffix); - if (!ck_cmdline) - return -ENOENT; - - ck_cmdline += strlen(name); - - if (suffix) - return parse_crashkernel_suffix(ck_cmdline, crash_size, - suffix); /* - * if the commandline contains a ':', then that's the extended - * syntax -- if not, it must be the classic syntax + * If crash_kexec_post_notifiers is enabled, don't run + * crash_kexec() here yet, which must be run after panic + * notifiers in panic(). */ - first_colon = strchr(ck_cmdline, ':'); - first_space = strchr(ck_cmdline, ' '); - if (first_colon && (!first_space || first_colon < first_space)) - return parse_crashkernel_mem(ck_cmdline, system_ram, - crash_size, crash_base); - - return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); -} - -/* - * That function is the entry point for command line parsing and should be - * called from the arch-specific code. - * - * If crashkernel=,high|low is supported on architecture, non-NULL values - * should be passed to parameters 'low_size' and 'high'. - */ -int __init parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base, - unsigned long long *low_size, - bool *high) -{ - int ret; - - /* crashkernel=X[@offset] */ - ret = __parse_crashkernel(cmdline, system_ram, crash_size, - crash_base, NULL); -#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION + if (crash_kexec_post_notifiers) + return 0; /* - * If non-NULL 'high' passed in and no normal crashkernel - * setting detected, try parsing crashkernel=,high|low. + * There are 4 panic() calls in make_task_dead() path, each of which + * corresponds to each of these 4 conditions. */ - if (high && ret == -ENOENT) { - ret = __parse_crashkernel(cmdline, 0, crash_size, - crash_base, suffix_tbl[SUFFIX_HIGH]); - if (ret || !*crash_size) - return -EINVAL; - - /* - * crashkernel=Y,low can be specified or not, but invalid value - * is not allowed. - */ - ret = __parse_crashkernel(cmdline, 0, low_size, - crash_base, suffix_tbl[SUFFIX_LOW]); - if (ret == -ENOENT) { - *low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; - ret = 0; - } else if (ret) { - return ret; - } - - *high = true; - } -#endif - if (!*crash_size) - ret = -EINVAL; - - return ret; + if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) + return 1; + return 0; } -/* - * Add a dummy early_param handler to mark crashkernel= as a known command line - * parameter and suppress incorrect warnings in init/main.c. - */ -static int __init parse_crashkernel_dummy(char *arg) +int kexec_crash_loaded(void) { - return 0; + return !!kexec_crash_image; } -early_param("crashkernel", parse_crashkernel_dummy); +EXPORT_SYMBOL_GPL(kexec_crash_loaded); -#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION -static int __init reserve_crashkernel_low(unsigned long long low_size) +/* + * No panic_cpu check version of crash_kexec(). This function is called + * only when panic_cpu holds the current CPU number; this is the only CPU + * which processes crash_kexec routines. + */ +void __noclone __crash_kexec(struct pt_regs *regs) { -#ifdef CONFIG_64BIT - unsigned long long low_base; - - low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX); - if (!low_base) { - pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size); - return -ENOMEM; + /* Take the kexec_lock here to prevent sys_kexec_load + * running on one cpu from replacing the crash kernel + * we are using after a panic on a different cpu. + * + * If the crash kernel was not located in a fixed area + * of memory the xchg(&kexec_crash_image) would be + * sufficient. But since I reuse the memory... + */ + if (kexec_trylock()) { + if (kexec_crash_image) { + struct pt_regs fixed_regs; + + crash_setup_regs(&fixed_regs, regs); + crash_save_vmcoreinfo(); + machine_crash_shutdown(&fixed_regs); + machine_kexec(kexec_crash_image); + } + kexec_unlock(); } - - pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n", - low_base, low_base + low_size, low_size >> 20); - - crashk_low_res.start = low_base; - crashk_low_res.end = low_base + low_size - 1; -#endif - return 0; } +STACK_FRAME_NON_STANDARD(__crash_kexec); -void __init reserve_crashkernel_generic(char *cmdline, - unsigned long long crash_size, - unsigned long long crash_base, - unsigned long long crash_low_size, - bool high) +__bpf_kfunc void crash_kexec(struct pt_regs *regs) { - unsigned long long search_end = CRASH_ADDR_LOW_MAX, search_base = 0; - bool fixed_base = false; - - /* User specifies base address explicitly. */ - if (crash_base) { - fixed_base = true; - search_base = crash_base; - search_end = crash_base + crash_size; - } else if (high) { - search_base = CRASH_ADDR_LOW_MAX; - search_end = CRASH_ADDR_HIGH_MAX; - } + int old_cpu, this_cpu; -retry: - crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN, - search_base, search_end); - if (!crash_base) { - /* - * For crashkernel=size[KMG]@offset[KMG], print out failure - * message if can't reserve the specified region. - */ - if (fixed_base) { - pr_warn("crashkernel reservation failed - memory is in use.\n"); - return; - } + /* + * Only one CPU is allowed to execute the crash_kexec() code as with + * panic(). Otherwise parallel calls of panic() and crash_kexec() + * may stop each other. To exclude them, we use panic_cpu here too. + */ + old_cpu = PANIC_CPU_INVALID; + this_cpu = raw_smp_processor_id(); - /* - * For crashkernel=size[KMG], if the first attempt was for - * low memory, fall back to high memory, the minimum required - * low memory will be reserved later. - */ - if (!high && search_end == CRASH_ADDR_LOW_MAX) { - search_end = CRASH_ADDR_HIGH_MAX; - search_base = CRASH_ADDR_LOW_MAX; - crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; - goto retry; - } + if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { + /* This is the 1st CPU which comes here, so go ahead. */ + __crash_kexec(regs); /* - * For crashkernel=size[KMG],high, if the first attempt was - * for high memory, fall back to low memory. + * Reset panic_cpu to allow another panic()/crash_kexec() + * call. */ - if (high && search_end == CRASH_ADDR_HIGH_MAX) { - search_end = CRASH_ADDR_LOW_MAX; - search_base = 0; - goto retry; - } - pr_warn("cannot allocate crashkernel (size:0x%llx)\n", - crash_size); - return; - } - - if ((crash_base >= CRASH_ADDR_LOW_MAX) && - crash_low_size && reserve_crashkernel_low(crash_low_size)) { - memblock_phys_free(crash_base, crash_size); - return; + atomic_set(&panic_cpu, PANIC_CPU_INVALID); } - - pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n", - crash_base, crash_base + crash_size, crash_size >> 20); - - /* - * The crashkernel memory will be removed from the kernel linear - * map. Inform kmemleak so that it won't try to access it. - */ - kmemleak_ignore_phys(crash_base); - if (crashk_low_res.end) - kmemleak_ignore_phys(crashk_low_res.start); - - crashk_res.start = crash_base; - crashk_res.end = crash_base + crash_size - 1; } -static __init int insert_crashkernel_resources(void) +static inline resource_size_t crash_resource_size(const struct resource *res) { - if (crashk_res.start < crashk_res.end) - insert_resource(&iomem_resource, &crashk_res); + return !res->end ? 0 : resource_size(res); +} + - if (crashk_low_res.start < crashk_low_res.end) - insert_resource(&iomem_resource, &crashk_low_res); - return 0; -} -early_initcall(insert_crashkernel_resources); -#endif int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, void **addr, unsigned long *sz) @@ -633,205 +317,129 @@ int crash_exclude_mem_range(struct crash_mem *mem, return 0; } -Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, - void *data, size_t data_len) +ssize_t crash_get_memory_size(void) { - struct elf_note *note = (struct elf_note *)buf; - - note->n_namesz = strlen(name) + 1; - note->n_descsz = data_len; - note->n_type = type; - buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word)); - memcpy(buf, name, note->n_namesz); - buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word)); - memcpy(buf, data, data_len); - buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word)); - - return buf; -} + ssize_t size = 0; -void final_note(Elf_Word *buf) -{ - memset(buf, 0, sizeof(struct elf_note)); -} + if (!kexec_trylock()) + return -EBUSY; -static void update_vmcoreinfo_note(void) -{ - u32 *buf = vmcoreinfo_note; + size += crash_resource_size(&crashk_res); + size += crash_resource_size(&crashk_low_res); - if (!vmcoreinfo_size) - return; - buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, - vmcoreinfo_size); - final_note(buf); + kexec_unlock(); + return size; } -void crash_update_vmcoreinfo_safecopy(void *ptr) +static int __crash_shrink_memory(struct resource *old_res, + unsigned long new_size) { - if (ptr) - memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size); + struct resource *ram_res; - vmcoreinfo_data_safecopy = ptr; -} + ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); + if (!ram_res) + return -ENOMEM; -void crash_save_vmcoreinfo(void) -{ - if (!vmcoreinfo_note) - return; + ram_res->start = old_res->start + new_size; + ram_res->end = old_res->end; + ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; + ram_res->name = "System RAM"; + + if (!new_size) { + release_resource(old_res); + old_res->start = 0; + old_res->end = 0; + } else { + crashk_res.end = ram_res->start - 1; + } - /* Use the safe copy to generate vmcoreinfo note if have */ - if (vmcoreinfo_data_safecopy) - vmcoreinfo_data = vmcoreinfo_data_safecopy; + crash_free_reserved_phys_range(ram_res->start, ram_res->end); + insert_resource(&iomem_resource, ram_res); - vmcoreinfo_append_str("CRASHTIME=%lld\n", ktime_get_real_seconds()); - update_vmcoreinfo_note(); + return 0; } -void vmcoreinfo_append_str(const char *fmt, ...) +int crash_shrink_memory(unsigned long new_size) { - va_list args; - char buf[0x50]; - size_t r; - - va_start(args, fmt); - r = vscnprintf(buf, sizeof(buf), fmt, args); - va_end(args); - - r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size); - - memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); - - vmcoreinfo_size += r; - - WARN_ONCE(vmcoreinfo_size == VMCOREINFO_BYTES, - "vmcoreinfo data exceeds allocated size, truncating"); -} + int ret = 0; + unsigned long old_size, low_size; -/* - * provide an empty default implementation here -- architecture - * code may override this - */ -void __weak arch_crash_save_vmcoreinfo(void) -{} + if (!kexec_trylock()) + return -EBUSY; -phys_addr_t __weak paddr_vmcoreinfo_note(void) -{ - return __pa(vmcoreinfo_note); -} -EXPORT_SYMBOL(paddr_vmcoreinfo_note); + if (kexec_crash_image) { + ret = -ENOENT; + goto unlock; + } -static int __init crash_save_vmcoreinfo_init(void) -{ - vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); - if (!vmcoreinfo_data) { - pr_warn("Memory allocation for vmcoreinfo_data failed\n"); - return -ENOMEM; + low_size = crash_resource_size(&crashk_low_res); + old_size = crash_resource_size(&crashk_res) + low_size; + new_size = roundup(new_size, KEXEC_CRASH_MEM_ALIGN); + if (new_size >= old_size) { + ret = (new_size == old_size) ? 0 : -EINVAL; + goto unlock; } - vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE, - GFP_KERNEL | __GFP_ZERO); - if (!vmcoreinfo_note) { - free_page((unsigned long)vmcoreinfo_data); - vmcoreinfo_data = NULL; - pr_warn("Memory allocation for vmcoreinfo_note failed\n"); - return -ENOMEM; + /* + * (low_size > new_size) implies that low_size is greater than zero. + * This also means that if low_size is zero, the else branch is taken. + * + * If low_size is greater than 0, (low_size > new_size) indicates that + * crashk_low_res also needs to be shrunken. Otherwise, only crashk_res + * needs to be shrunken. + */ + if (low_size > new_size) { + ret = __crash_shrink_memory(&crashk_res, 0); + if (ret) + goto unlock; + + ret = __crash_shrink_memory(&crashk_low_res, new_size); + } else { + ret = __crash_shrink_memory(&crashk_res, new_size - low_size); } - VMCOREINFO_OSRELEASE(init_uts_ns.name.release); - VMCOREINFO_BUILD_ID(); - VMCOREINFO_PAGESIZE(PAGE_SIZE); + /* Swap crashk_res and crashk_low_res if needed */ + if (!crashk_res.end && crashk_low_res.end) { + crashk_res.start = crashk_low_res.start; + crashk_res.end = crashk_low_res.end; + release_resource(&crashk_low_res); + crashk_low_res.start = 0; + crashk_low_res.end = 0; + insert_resource(&iomem_resource, &crashk_res); + } - VMCOREINFO_SYMBOL(init_uts_ns); - VMCOREINFO_OFFSET(uts_namespace, name); - VMCOREINFO_SYMBOL(node_online_map); -#ifdef CONFIG_MMU - VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir); -#endif - VMCOREINFO_SYMBOL(_stext); - VMCOREINFO_SYMBOL(vmap_area_list); +unlock: + kexec_unlock(); + return ret; +} -#ifndef CONFIG_NUMA - VMCOREINFO_SYMBOL(mem_map); - VMCOREINFO_SYMBOL(contig_page_data); -#endif -#ifdef CONFIG_SPARSEMEM - VMCOREINFO_SYMBOL_ARRAY(mem_section); - VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); - VMCOREINFO_STRUCT_SIZE(mem_section); - VMCOREINFO_OFFSET(mem_section, section_mem_map); - VMCOREINFO_NUMBER(SECTION_SIZE_BITS); - VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS); -#endif - VMCOREINFO_STRUCT_SIZE(page); - VMCOREINFO_STRUCT_SIZE(pglist_data); - VMCOREINFO_STRUCT_SIZE(zone); - VMCOREINFO_STRUCT_SIZE(free_area); - VMCOREINFO_STRUCT_SIZE(list_head); - VMCOREINFO_SIZE(nodemask_t); - VMCOREINFO_OFFSET(page, flags); - VMCOREINFO_OFFSET(page, _refcount); - VMCOREINFO_OFFSET(page, mapping); - VMCOREINFO_OFFSET(page, lru); - VMCOREINFO_OFFSET(page, _mapcount); - VMCOREINFO_OFFSET(page, private); - VMCOREINFO_OFFSET(page, compound_head); - VMCOREINFO_OFFSET(pglist_data, node_zones); - VMCOREINFO_OFFSET(pglist_data, nr_zones); -#ifdef CONFIG_FLATMEM - VMCOREINFO_OFFSET(pglist_data, node_mem_map); -#endif - VMCOREINFO_OFFSET(pglist_data, node_start_pfn); - VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); - VMCOREINFO_OFFSET(pglist_data, node_id); - VMCOREINFO_OFFSET(zone, free_area); - VMCOREINFO_OFFSET(zone, vm_stat); - VMCOREINFO_OFFSET(zone, spanned_pages); - VMCOREINFO_OFFSET(free_area, free_list); - VMCOREINFO_OFFSET(list_head, next); - VMCOREINFO_OFFSET(list_head, prev); - VMCOREINFO_OFFSET(vmap_area, va_start); - VMCOREINFO_OFFSET(vmap_area, list); - VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS); - log_buf_vmcoreinfo_setup(); - VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); - VMCOREINFO_NUMBER(NR_FREE_PAGES); - VMCOREINFO_NUMBER(PG_lru); - VMCOREINFO_NUMBER(PG_private); - VMCOREINFO_NUMBER(PG_swapcache); - VMCOREINFO_NUMBER(PG_swapbacked); - VMCOREINFO_NUMBER(PG_slab); -#ifdef CONFIG_MEMORY_FAILURE - VMCOREINFO_NUMBER(PG_hwpoison); -#endif - VMCOREINFO_NUMBER(PG_head_mask); -#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy) - VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); -#ifdef CONFIG_HUGETLB_PAGE - VMCOREINFO_NUMBER(PG_hugetlb); -#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline) - VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); -#endif +void crash_save_cpu(struct pt_regs *regs, int cpu) +{ + struct elf_prstatus prstatus; + u32 *buf; -#ifdef CONFIG_KALLSYMS - VMCOREINFO_SYMBOL(kallsyms_names); - VMCOREINFO_SYMBOL(kallsyms_num_syms); - VMCOREINFO_SYMBOL(kallsyms_token_table); - VMCOREINFO_SYMBOL(kallsyms_token_index); -#ifdef CONFIG_KALLSYMS_BASE_RELATIVE - VMCOREINFO_SYMBOL(kallsyms_offsets); - VMCOREINFO_SYMBOL(kallsyms_relative_base); -#else - VMCOREINFO_SYMBOL(kallsyms_addresses); -#endif /* CONFIG_KALLSYMS_BASE_RELATIVE */ -#endif /* CONFIG_KALLSYMS */ - - arch_crash_save_vmcoreinfo(); - update_vmcoreinfo_note(); + if ((cpu < 0) || (cpu >= nr_cpu_ids)) + return; - return 0; + /* Using ELF notes here is opportunistic. + * I need a well defined structure format + * for the data I pass, and I need tags + * on the data to indicate what information I have + * squirrelled away. ELF notes happen to provide + * all of that, so there is no need to invent something new. + */ + buf = (u32 *)per_cpu_ptr(crash_notes, cpu); + if (!buf) + return; + memset(&prstatus, 0, sizeof(prstatus)); + prstatus.common.pr_pid = current->pid; + elf_core_copy_regs(&prstatus.pr_reg, regs); + buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, + &prstatus, sizeof(prstatus)); + final_note(buf); } -subsys_initcall(crash_save_vmcoreinfo_init); + static int __init crash_notes_memory_init(void) { @@ -866,6 +474,8 @@ static int __init crash_notes_memory_init(void) } subsys_initcall(crash_notes_memory_init); +#endif /*CONFIG_CRASH_DUMP*/ + #ifdef CONFIG_CRASH_HOTPLUG #undef pr_fmt #define pr_fmt(fmt) "crash hp: " fmt diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c new file mode 100644 index 000000000000..bbb6c3cb00e4 --- /dev/null +++ b/kernel/crash_reserve.c @@ -0,0 +1,464 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * crash.c - kernel crash support code. + * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> + */ + +#include <linux/buildid.h> +#include <linux/init.h> +#include <linux/utsname.h> +#include <linux/vmalloc.h> +#include <linux/sizes.h> +#include <linux/kexec.h> +#include <linux/memory.h> +#include <linux/cpuhotplug.h> +#include <linux/memblock.h> +#include <linux/kexec.h> +#include <linux/kmemleak.h> + +#include <asm/page.h> +#include <asm/sections.h> + +#include <crypto/sha1.h> + +#include "kallsyms_internal.h" +#include "kexec_internal.h" + +/* Location of the reserved area for the crash kernel */ +struct resource crashk_res = { + .name = "Crash kernel", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, + .desc = IORES_DESC_CRASH_KERNEL +}; +struct resource crashk_low_res = { + .name = "Crash kernel", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, + .desc = IORES_DESC_CRASH_KERNEL +}; + +/* + * parsing the "crashkernel" commandline + * + * this code is intended to be called from architecture specific code + */ + + +/* + * This function parses command lines in the format + * + * crashkernel=ramsize-range:size[,...][@offset] + * + * The function returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_mem(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline, *tmp; + unsigned long long total_mem = system_ram; + + /* + * Firmware sometimes reserves some memory regions for its own use, + * so the system memory size is less than the actual physical memory + * size. Work around this by rounding up the total size to 128M, + * which is enough for most test cases. + */ + total_mem = roundup(total_mem, SZ_128M); + + /* for each entry of the comma-separated list */ + do { + unsigned long long start, end = ULLONG_MAX, size; + + /* get the start of the range */ + start = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (*cur != '-') { + pr_warn("crashkernel: '-' expected\n"); + return -EINVAL; + } + cur++; + + /* if no ':' is here, than we read the end */ + if (*cur != ':') { + end = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (end <= start) { + pr_warn("crashkernel: end <= start\n"); + return -EINVAL; + } + } + + if (*cur != ':') { + pr_warn("crashkernel: ':' expected\n"); + return -EINVAL; + } + cur++; + + size = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (size >= total_mem) { + pr_warn("crashkernel: invalid size\n"); + return -EINVAL; + } + + /* match ? */ + if (total_mem >= start && total_mem < end) { + *crash_size = size; + break; + } + } while (*cur++ == ','); + + if (*crash_size > 0) { + while (*cur && *cur != ' ' && *cur != '@') + cur++; + if (*cur == '@') { + cur++; + *crash_base = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("Memory value expected after '@'\n"); + return -EINVAL; + } + } + } else + pr_info("crashkernel size resulted in zero bytes\n"); + + return 0; +} + +/* + * That function parses "simple" (old) crashkernel command lines like + * + * crashkernel=size[@offset] + * + * It returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_simple(char *cmdline, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + if (*cur == '@') + *crash_base = memparse(cur+1, &cur); + else if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + + return 0; +} + +#define SUFFIX_HIGH 0 +#define SUFFIX_LOW 1 +#define SUFFIX_NULL 2 +static __initdata char *suffix_tbl[] = { + [SUFFIX_HIGH] = ",high", + [SUFFIX_LOW] = ",low", + [SUFFIX_NULL] = NULL, +}; + +/* + * That function parses "suffix" crashkernel command lines like + * + * crashkernel=size,[high|low] + * + * It returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_suffix(char *cmdline, + unsigned long long *crash_size, + const char *suffix) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + /* check with suffix */ + if (strncmp(cur, suffix, strlen(suffix))) { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + cur += strlen(suffix); + if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + + return 0; +} + +static __init char *get_last_crashkernel(char *cmdline, + const char *name, + const char *suffix) +{ + char *p = cmdline, *ck_cmdline = NULL; + + /* find crashkernel and use the last one if there are more */ + p = strstr(p, name); + while (p) { + char *end_p = strchr(p, ' '); + char *q; + + if (!end_p) + end_p = p + strlen(p); + + if (!suffix) { + int i; + + /* skip the one with any known suffix */ + for (i = 0; suffix_tbl[i]; i++) { + q = end_p - strlen(suffix_tbl[i]); + if (!strncmp(q, suffix_tbl[i], + strlen(suffix_tbl[i]))) + goto next; + } + ck_cmdline = p; + } else { + q = end_p - strlen(suffix); + if (!strncmp(q, suffix, strlen(suffix))) + ck_cmdline = p; + } +next: + p = strstr(p+1, name); + } + + return ck_cmdline; +} + +static int __init __parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base, + const char *suffix) +{ + char *first_colon, *first_space; + char *ck_cmdline; + char *name = "crashkernel="; + + BUG_ON(!crash_size || !crash_base); + *crash_size = 0; + *crash_base = 0; + + ck_cmdline = get_last_crashkernel(cmdline, name, suffix); + if (!ck_cmdline) + return -ENOENT; + + ck_cmdline += strlen(name); + + if (suffix) + return parse_crashkernel_suffix(ck_cmdline, crash_size, + suffix); + /* + * if the commandline contains a ':', then that's the extended + * syntax -- if not, it must be the classic syntax + */ + first_colon = strchr(ck_cmdline, ':'); + first_space = strchr(ck_cmdline, ' '); + if (first_colon && (!first_space || first_colon < first_space)) + return parse_crashkernel_mem(ck_cmdline, system_ram, + crash_size, crash_base); + + return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); +} + +/* + * That function is the entry point for command line parsing and should be + * called from the arch-specific code. + * + * If crashkernel=,high|low is supported on architecture, non-NULL values + * should be passed to parameters 'low_size' and 'high'. + */ +int __init parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base, + unsigned long long *low_size, + bool *high) +{ + int ret; + + /* crashkernel=X[@offset] */ + ret = __parse_crashkernel(cmdline, system_ram, crash_size, + crash_base, NULL); +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION + /* + * If non-NULL 'high' passed in and no normal crashkernel + * setting detected, try parsing crashkernel=,high|low. + */ + if (high && ret == -ENOENT) { + ret = __parse_crashkernel(cmdline, 0, crash_size, + crash_base, suffix_tbl[SUFFIX_HIGH]); + if (ret || !*crash_size) + return -EINVAL; + + /* + * crashkernel=Y,low can be specified or not, but invalid value + * is not allowed. + */ + ret = __parse_crashkernel(cmdline, 0, low_size, + crash_base, suffix_tbl[SUFFIX_LOW]); + if (ret == -ENOENT) { + *low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; + ret = 0; + } else if (ret) { + return ret; + } + + *high = true; + } +#endif + if (!*crash_size) + ret = -EINVAL; + + return ret; +} + +/* + * Add a dummy early_param handler to mark crashkernel= as a known command line + * parameter and suppress incorrect warnings in init/main.c. + */ +static int __init parse_crashkernel_dummy(char *arg) +{ + return 0; +} +early_param("crashkernel", parse_crashkernel_dummy); + +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION +static int __init reserve_crashkernel_low(unsigned long long low_size) +{ +#ifdef CONFIG_64BIT + unsigned long long low_base; + + low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX); + if (!low_base) { + pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size); + return -ENOMEM; + } + + pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n", + low_base, low_base + low_size, low_size >> 20); + + crashk_low_res.start = low_base; + crashk_low_res.end = low_base + low_size - 1; + insert_resource(&iomem_resource, &crashk_low_res); +#endif + return 0; +} + +void __init reserve_crashkernel_generic(char *cmdline, + unsigned long long crash_size, + unsigned long long crash_base, + unsigned long long crash_low_size, + bool high) +{ + unsigned long long search_end = CRASH_ADDR_LOW_MAX, search_base = 0; + bool fixed_base = false; + + /* User specifies base address explicitly. */ + if (crash_base) { + fixed_base = true; + search_base = crash_base; + search_end = crash_base + crash_size; + } else if (high) { + search_base = CRASH_ADDR_LOW_MAX; + search_end = CRASH_ADDR_HIGH_MAX; + } + +retry: + crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN, + search_base, search_end); + if (!crash_base) { + /* + * For crashkernel=size[KMG]@offset[KMG], print out failure + * message if can't reserve the specified region. + */ + if (fixed_base) { + pr_warn("crashkernel reservation failed - memory is in use.\n"); + return; + } + + /* + * For crashkernel=size[KMG], if the first attempt was for + * low memory, fall back to high memory, the minimum required + * low memory will be reserved later. + */ + if (!high && search_end == CRASH_ADDR_LOW_MAX) { + search_end = CRASH_ADDR_HIGH_MAX; + search_base = CRASH_ADDR_LOW_MAX; + crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; + goto retry; + } + + /* + * For crashkernel=size[KMG],high, if the first attempt was + * for high memory, fall back to low memory. + */ + if (high && search_end == CRASH_ADDR_HIGH_MAX) { + search_end = CRASH_ADDR_LOW_MAX; + search_base = 0; + goto retry; + } + pr_warn("cannot allocate crashkernel (size:0x%llx)\n", + crash_size); + return; + } + + if ((crash_base >= CRASH_ADDR_LOW_MAX) && + crash_low_size && reserve_crashkernel_low(crash_low_size)) { + memblock_phys_free(crash_base, crash_size); + return; + } + + pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n", + crash_base, crash_base + crash_size, crash_size >> 20); + + /* + * The crashkernel memory will be removed from the kernel linear + * map. Inform kmemleak so that it won't try to access it. + */ + kmemleak_ignore_phys(crash_base); + if (crashk_low_res.end) + kmemleak_ignore_phys(crashk_low_res.start); + + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; +} + +static __init int insert_crashkernel_resources(void) +{ + if (crashk_res.start < crashk_res.end) + insert_resource(&iomem_resource, &crashk_res); + + if (crashk_low_res.start < crashk_low_res.end) + insert_resource(&iomem_resource, &crashk_low_res); + + return 0; +} +early_initcall(insert_crashkernel_resources); +#endif diff --git a/kernel/cred.c b/kernel/cred.c index c033a201c808..075cfa7c896f 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -606,8 +606,8 @@ int set_cred_ucounts(struct cred *new) void __init cred_init(void) { /* allocate a slab in which we can store credentials */ - cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); + cred_jar = KMEM_CACHE(cred, + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); } /** diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index f005c66f378c..055da410ac71 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -37,12 +37,6 @@ #define pr_fmt(fmt) "cma: " fmt -#ifdef CONFIG_CMA_DEBUG -#ifndef DEBUG -# define DEBUG -#endif -#endif - #include <asm/page.h> #include <linux/memblock.h> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 98b2e192fd69..4d543b1e9d57 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -286,7 +286,7 @@ void *dma_direct_alloc(struct device *dev, size_t size, } else { ret = page_address(page); if (dma_set_decrypted(dev, ret, size)) - goto out_free_pages; + goto out_leak_pages; } memset(ret, 0, size); @@ -307,6 +307,8 @@ out_encrypt_pages: out_free_pages: __dma_direct_free_pages(dev, page, size); return NULL; +out_leak_pages: + return NULL; } void dma_direct_free(struct device *dev, size_t size, @@ -367,12 +369,11 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size, ret = page_address(page); if (dma_set_decrypted(dev, ret, size)) - goto out_free_pages; + goto out_leak_pages; memset(ret, 0, size); *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); return page; -out_free_pages: - __dma_direct_free_pages(dev, page, size); +out_leak_pages: return NULL; } diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index b079a9a8e087..86fe172b5958 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -956,6 +956,28 @@ static void dec_used(struct io_tlb_mem *mem, unsigned int nslots) } #endif /* CONFIG_DEBUG_FS */ +#ifdef CONFIG_SWIOTLB_DYNAMIC +#ifdef CONFIG_DEBUG_FS +static void inc_transient_used(struct io_tlb_mem *mem, unsigned int nslots) +{ + atomic_long_add(nslots, &mem->transient_nslabs); +} + +static void dec_transient_used(struct io_tlb_mem *mem, unsigned int nslots) +{ + atomic_long_sub(nslots, &mem->transient_nslabs); +} + +#else /* !CONFIG_DEBUG_FS */ +static void inc_transient_used(struct io_tlb_mem *mem, unsigned int nslots) +{ +} +static void dec_transient_used(struct io_tlb_mem *mem, unsigned int nslots) +{ +} +#endif /* CONFIG_DEBUG_FS */ +#endif /* CONFIG_SWIOTLB_DYNAMIC */ + /** * swiotlb_search_pool_area() - search one memory area in one pool * @dev: Device which maps the buffer. @@ -981,8 +1003,7 @@ static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(dev, pool->start) & boundary_mask; unsigned long max_slots = get_max_slots(boundary_mask); - unsigned int iotlb_align_mask = - dma_get_min_align_mask(dev) | alloc_align_mask; + unsigned int iotlb_align_mask = dma_get_min_align_mask(dev); unsigned int nslots = nr_slots(alloc_size), stride; unsigned int offset = swiotlb_align_offset(dev, orig_addr); unsigned int index, slots_checked, count = 0, i; @@ -994,18 +1015,29 @@ static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool BUG_ON(area_index >= pool->nareas); /* - * For allocations of PAGE_SIZE or larger only look for page aligned - * allocations. + * Historically, swiotlb allocations >= PAGE_SIZE were guaranteed to be + * page-aligned in the absence of any other alignment requirements. + * 'alloc_align_mask' was later introduced to specify the alignment + * explicitly, however this is passed as zero for streaming mappings + * and so we preserve the old behaviour there in case any drivers are + * relying on it. */ - if (alloc_size >= PAGE_SIZE) - iotlb_align_mask |= ~PAGE_MASK; - iotlb_align_mask &= ~(IO_TLB_SIZE - 1); + if (!alloc_align_mask && !iotlb_align_mask && alloc_size >= PAGE_SIZE) + alloc_align_mask = PAGE_SIZE - 1; + + /* + * Ensure that the allocation is at least slot-aligned and update + * 'iotlb_align_mask' to ignore bits that will be preserved when + * offsetting into the allocation. + */ + alloc_align_mask |= (IO_TLB_SIZE - 1); + iotlb_align_mask &= ~alloc_align_mask; /* * For mappings with an alignment requirement don't bother looping to * unaligned slots once we found an aligned one. */ - stride = (iotlb_align_mask >> IO_TLB_SHIFT) + 1; + stride = get_max_slots(max(alloc_align_mask, iotlb_align_mask)); spin_lock_irqsave(&area->lock, flags); if (unlikely(nslots > pool->area_nslabs - area->used)) @@ -1015,11 +1047,14 @@ static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool index = area->index; for (slots_checked = 0; slots_checked < pool->area_nslabs; ) { + phys_addr_t tlb_addr; + slot_index = slot_base + index; + tlb_addr = slot_addr(tbl_dma_addr, slot_index); - if (orig_addr && - (slot_addr(tbl_dma_addr, slot_index) & - iotlb_align_mask) != (orig_addr & iotlb_align_mask)) { + if ((tlb_addr & alloc_align_mask) || + (orig_addr && (tlb_addr & iotlb_align_mask) != + (orig_addr & iotlb_align_mask))) { index = wrap_area_index(pool, index + 1); slots_checked++; continue; @@ -1170,6 +1205,7 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, spin_lock_irqsave(&dev->dma_io_tlb_lock, flags); list_add_rcu(&pool->node, &dev->dma_io_tlb_pools); spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags); + inc_transient_used(mem, pool->nslabs); found: WRITE_ONCE(dev->dma_uses_io_tlb, true); @@ -1415,6 +1451,7 @@ static bool swiotlb_del_transient(struct device *dev, phys_addr_t tlb_addr) dec_used(dev->dma_io_tlb_mem, pool->nslabs); swiotlb_del_pool(dev, pool); + dec_transient_used(dev->dma_io_tlb_mem, pool->nslabs); return true; } @@ -1557,6 +1594,23 @@ phys_addr_t default_swiotlb_limit(void) } #ifdef CONFIG_DEBUG_FS +#ifdef CONFIG_SWIOTLB_DYNAMIC +static unsigned long mem_transient_used(struct io_tlb_mem *mem) +{ + return atomic_long_read(&mem->transient_nslabs); +} + +static int io_tlb_transient_used_get(void *data, u64 *val) +{ + struct io_tlb_mem *mem = data; + + *val = mem_transient_used(mem); + return 0; +} + +DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_transient_used, io_tlb_transient_used_get, + NULL, "%llu\n"); +#endif /* CONFIG_SWIOTLB_DYNAMIC */ static int io_tlb_used_get(void *data, u64 *val) { @@ -1605,6 +1659,11 @@ static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem, &fops_io_tlb_used); debugfs_create_file("io_tlb_used_hiwater", 0600, mem->debugfs, mem, &fops_io_tlb_hiwater); +#ifdef CONFIG_SWIOTLB_DYNAMIC + atomic_long_set(&mem->transient_nslabs, 0); + debugfs_create_file("io_tlb_transient_nslabs", 0400, mem->debugfs, + mem, &fops_io_tlb_transient_used); +#endif } static int __init swiotlb_create_default_debugfs(void) @@ -1631,16 +1690,24 @@ struct page *swiotlb_alloc(struct device *dev, size_t size) struct io_tlb_mem *mem = dev->dma_io_tlb_mem; struct io_tlb_pool *pool; phys_addr_t tlb_addr; + unsigned int align; int index; if (!mem) return NULL; - index = swiotlb_find_slots(dev, 0, size, 0, &pool); + align = (1 << (get_order(size) + PAGE_SHIFT)) - 1; + index = swiotlb_find_slots(dev, 0, size, align, &pool); if (index == -1) return NULL; tlb_addr = slot_addr(pool->start, index); + if (unlikely(!PAGE_ALIGNED(tlb_addr))) { + dev_WARN_ONCE(dev, 1, "Cannot allocate pages from non page-aligned swiotlb addr 0x%pa.\n", + &tlb_addr); + swiotlb_release_slots(dev, tlb_addr); + return NULL; + } return pfn_to_page(PFN_DOWN(tlb_addr)); } diff --git a/kernel/crash_dump.c b/kernel/elfcorehdr.c index 92da32275af5..92da32275af5 100644 --- a/kernel/crash_dump.c +++ b/kernel/elfcorehdr.c diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 88cb3c88aaa5..90843cc38588 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -57,8 +57,14 @@ long syscall_trace_enter(struct pt_regs *regs, long syscall, /* Either of the above might have changed the syscall number */ syscall = syscall_get_nr(current, regs); - if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) + if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) { trace_sys_enter(regs, syscall); + /* + * Probes or BPF hooks in the tracepoint may have changed the + * system call number as well. + */ + syscall = syscall_get_nr(current, regs); + } syscall_enter_audit(regs, syscall); diff --git a/kernel/events/core.c b/kernel/events/core.c index f0f0f71213a1..724e6d7e128f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9302,10 +9302,6 @@ void perf_event_bpf_event(struct bpf_prog *prog, { struct perf_bpf_event bpf_event; - if (type <= PERF_BPF_EVENT_UNKNOWN || - type >= PERF_BPF_EVENT_MAX) - return; - switch (type) { case PERF_BPF_EVENT_PROG_LOAD: case PERF_BPF_EVENT_PROG_UNLOAD: @@ -9313,7 +9309,7 @@ void perf_event_bpf_event(struct bpf_prog *prog, perf_event_bpf_emit_ksymbols(prog, type); break; default: - break; + return; } if (!atomic_read(&nr_bpf_events)) @@ -10557,7 +10553,7 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) return -EINVAL; - if (prog->type == BPF_PROG_TYPE_KPROBE && prog->aux->sleepable && !is_uprobe) + if (prog->type == BPF_PROG_TYPE_KPROBE && prog->sleepable && !is_uprobe) /* only uprobe programs are allowed to be sleepable */ return -EINVAL; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 929e98c62965..e4834d23e1d1 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -188,7 +188,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, dec_mm_counter(mm, MM_ANONPAGES); if (!folio_test_anon(old_folio)) { - dec_mm_counter(mm, mm_counter_file(old_page)); + dec_mm_counter(mm, mm_counter_file(old_folio)); inc_mm_counter(mm, MM_ANONPAGES); } diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 9a24574988d2..b2fc2727d654 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -43,6 +43,7 @@ static int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; * Zero means infinite timeout - no checking done: */ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; +EXPORT_SYMBOL_GPL(sysctl_hung_task_timeout_secs); /* * Zero (default value) means use sysctl_hung_task_timeout_secs: diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c index b4cac76ea5e9..8a689b4ff4f9 100644 --- a/kernel/kallsyms_selftest.c +++ b/kernel/kallsyms_selftest.c @@ -89,7 +89,6 @@ static struct test_item test_items[] = { ITEM_DATA(kallsyms_test_var_data_static), ITEM_DATA(kallsyms_test_var_bss), ITEM_DATA(kallsyms_test_var_data), - ITEM_DATA(vmap_area_list), #endif }; diff --git a/kernel/kexec.c b/kernel/kexec.c index 8f35a5a42af8..bab542fc1463 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -28,12 +28,14 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, struct kimage *image; bool kexec_on_panic = flags & KEXEC_ON_CRASH; +#ifdef CONFIG_CRASH_DUMP if (kexec_on_panic) { /* Verify we have a valid entry point */ if ((entry < phys_to_boot_phys(crashk_res.start)) || (entry > phys_to_boot_phys(crashk_res.end))) return -EADDRNOTAVAIL; } +#endif /* Allocate and initialize a controlling structure */ image = do_kimage_alloc_init(); @@ -44,11 +46,13 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, image->nr_segments = nr_segments; memcpy(image->segment, segments, nr_segments * sizeof(*segments)); +#ifdef CONFIG_CRASH_DUMP if (kexec_on_panic) { /* Enable special crash kernel control page alloc policy. */ image->control_page = crashk_res.start; image->type = KEXEC_TYPE_CRASH; } +#endif ret = sanity_check_segment_list(image); if (ret) @@ -99,13 +103,14 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, if (!kexec_trylock()) return -EBUSY; +#ifdef CONFIG_CRASH_DUMP if (flags & KEXEC_ON_CRASH) { dest_image = &kexec_crash_image; if (kexec_crash_image) arch_kexec_unprotect_crashkres(); - } else { + } else +#endif dest_image = &kexec_image; - } if (nr_segments == 0) { /* Uninstall image */ @@ -162,8 +167,10 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, image = xchg(dest_image, image); out: +#ifdef CONFIG_CRASH_DUMP if ((flags & KEXEC_ON_CRASH) && kexec_crash_image) arch_kexec_protect_crashkres(); +#endif kimage_free(image); out_unlock: diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index d08fc7b5db97..0e96f6b24344 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -54,30 +54,6 @@ bool kexec_in_progress = false; bool kexec_file_dbg_print; -int kexec_should_crash(struct task_struct *p) -{ - /* - * If crash_kexec_post_notifiers is enabled, don't run - * crash_kexec() here yet, which must be run after panic - * notifiers in panic(). - */ - if (crash_kexec_post_notifiers) - return 0; - /* - * There are 4 panic() calls in make_task_dead() path, each of which - * corresponds to each of these 4 conditions. - */ - if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) - return 1; - return 0; -} - -int kexec_crash_loaded(void) -{ - return !!kexec_crash_image; -} -EXPORT_SYMBOL_GPL(kexec_crash_loaded); - /* * When kexec transitions to the new kernel there is a one-to-one * mapping between physical and virtual addresses. On processors @@ -209,6 +185,7 @@ int sanity_check_segment_list(struct kimage *image) if (total_pages > nr_pages / 2) return -EINVAL; +#ifdef CONFIG_CRASH_DUMP /* * Verify we have good destination addresses. Normally * the caller is responsible for making certain we don't @@ -231,6 +208,7 @@ int sanity_check_segment_list(struct kimage *image) return -EADDRNOTAVAIL; } } +#endif return 0; } @@ -403,6 +381,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image, return pages; } +#ifdef CONFIG_CRASH_DUMP static struct page *kimage_alloc_crash_control_pages(struct kimage *image, unsigned int order) { @@ -468,6 +447,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, return pages; } +#endif struct page *kimage_alloc_control_pages(struct kimage *image, @@ -479,48 +459,16 @@ struct page *kimage_alloc_control_pages(struct kimage *image, case KEXEC_TYPE_DEFAULT: pages = kimage_alloc_normal_control_pages(image, order); break; +#ifdef CONFIG_CRASH_DUMP case KEXEC_TYPE_CRASH: pages = kimage_alloc_crash_control_pages(image, order); break; +#endif } return pages; } -int kimage_crash_copy_vmcoreinfo(struct kimage *image) -{ - struct page *vmcoreinfo_page; - void *safecopy; - - if (image->type != KEXEC_TYPE_CRASH) - return 0; - - /* - * For kdump, allocate one vmcoreinfo safe copy from the - * crash memory. as we have arch_kexec_protect_crashkres() - * after kexec syscall, we naturally protect it from write - * (even read) access under kernel direct mapping. But on - * the other hand, we still need to operate it when crash - * happens to generate vmcoreinfo note, hereby we rely on - * vmap for this purpose. - */ - vmcoreinfo_page = kimage_alloc_control_pages(image, 0); - if (!vmcoreinfo_page) { - pr_warn("Could not allocate vmcoreinfo buffer\n"); - return -ENOMEM; - } - safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL); - if (!safecopy) { - pr_warn("Could not vmap vmcoreinfo buffer\n"); - return -ENOMEM; - } - - image->vmcoreinfo_data_copy = safecopy; - crash_update_vmcoreinfo_safecopy(safecopy); - - return 0; -} - static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) { if (*image->entry != 0) @@ -603,10 +551,12 @@ void kimage_free(struct kimage *image) if (!image) return; +#ifdef CONFIG_CRASH_DUMP if (image->vmcoreinfo_data_copy) { crash_update_vmcoreinfo_safecopy(NULL); vunmap(image->vmcoreinfo_data_copy); } +#endif kimage_free_extra_pages(image); for_each_kimage_entry(image, ptr, entry) { @@ -800,22 +750,24 @@ static int kimage_load_normal_segment(struct kimage *image, PAGE_SIZE - (maddr & ~PAGE_MASK)); uchunk = min(ubytes, mchunk); - /* For file based kexec, source pages are in kernel memory */ - if (image->file_mode) - memcpy(ptr, kbuf, uchunk); - else - result = copy_from_user(ptr, buf, uchunk); + if (uchunk) { + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); + ubytes -= uchunk; + if (image->file_mode) + kbuf += uchunk; + else + buf += uchunk; + } kunmap_local(ptr); if (result) { result = -EFAULT; goto out; } - ubytes -= uchunk; maddr += mchunk; - if (image->file_mode) - kbuf += mchunk; - else - buf += mchunk; mbytes -= mchunk; cond_resched(); @@ -824,6 +776,7 @@ out: return result; } +#ifdef CONFIG_CRASH_DUMP static int kimage_load_crash_segment(struct kimage *image, struct kexec_segment *segment) { @@ -866,11 +819,18 @@ static int kimage_load_crash_segment(struct kimage *image, memset(ptr + uchunk, 0, mchunk - uchunk); } - /* For file based kexec, source pages are in kernel memory */ - if (image->file_mode) - memcpy(ptr, kbuf, uchunk); - else - result = copy_from_user(ptr, buf, uchunk); + if (uchunk) { + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); + ubytes -= uchunk; + if (image->file_mode) + kbuf += uchunk; + else + buf += uchunk; + } kexec_flush_icache_page(page); kunmap_local(ptr); arch_kexec_pre_free_pages(page_address(page), 1); @@ -878,12 +838,7 @@ static int kimage_load_crash_segment(struct kimage *image, result = -EFAULT; goto out; } - ubytes -= uchunk; maddr += mchunk; - if (image->file_mode) - kbuf += mchunk; - else - buf += mchunk; mbytes -= mchunk; cond_resched(); @@ -891,6 +846,7 @@ static int kimage_load_crash_segment(struct kimage *image, out: return result; } +#endif int kimage_load_segment(struct kimage *image, struct kexec_segment *segment) @@ -901,9 +857,11 @@ int kimage_load_segment(struct kimage *image, case KEXEC_TYPE_DEFAULT: result = kimage_load_normal_segment(image, segment); break; +#ifdef CONFIG_CRASH_DUMP case KEXEC_TYPE_CRASH: result = kimage_load_crash_segment(image, segment); break; +#endif } return result; @@ -1028,186 +986,6 @@ bool kexec_load_permitted(int kexec_image_type) } /* - * No panic_cpu check version of crash_kexec(). This function is called - * only when panic_cpu holds the current CPU number; this is the only CPU - * which processes crash_kexec routines. - */ -void __noclone __crash_kexec(struct pt_regs *regs) -{ - /* Take the kexec_lock here to prevent sys_kexec_load - * running on one cpu from replacing the crash kernel - * we are using after a panic on a different cpu. - * - * If the crash kernel was not located in a fixed area - * of memory the xchg(&kexec_crash_image) would be - * sufficient. But since I reuse the memory... - */ - if (kexec_trylock()) { - if (kexec_crash_image) { - struct pt_regs fixed_regs; - - crash_setup_regs(&fixed_regs, regs); - crash_save_vmcoreinfo(); - machine_crash_shutdown(&fixed_regs); - machine_kexec(kexec_crash_image); - } - kexec_unlock(); - } -} -STACK_FRAME_NON_STANDARD(__crash_kexec); - -__bpf_kfunc void crash_kexec(struct pt_regs *regs) -{ - int old_cpu, this_cpu; - - /* - * Only one CPU is allowed to execute the crash_kexec() code as with - * panic(). Otherwise parallel calls of panic() and crash_kexec() - * may stop each other. To exclude them, we use panic_cpu here too. - */ - old_cpu = PANIC_CPU_INVALID; - this_cpu = raw_smp_processor_id(); - - if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { - /* This is the 1st CPU which comes here, so go ahead. */ - __crash_kexec(regs); - - /* - * Reset panic_cpu to allow another panic()/crash_kexec() - * call. - */ - atomic_set(&panic_cpu, PANIC_CPU_INVALID); - } -} - -static inline resource_size_t crash_resource_size(const struct resource *res) -{ - return !res->end ? 0 : resource_size(res); -} - -ssize_t crash_get_memory_size(void) -{ - ssize_t size = 0; - - if (!kexec_trylock()) - return -EBUSY; - - size += crash_resource_size(&crashk_res); - size += crash_resource_size(&crashk_low_res); - - kexec_unlock(); - return size; -} - -static int __crash_shrink_memory(struct resource *old_res, - unsigned long new_size) -{ - struct resource *ram_res; - - ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); - if (!ram_res) - return -ENOMEM; - - ram_res->start = old_res->start + new_size; - ram_res->end = old_res->end; - ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; - ram_res->name = "System RAM"; - - if (!new_size) { - release_resource(old_res); - old_res->start = 0; - old_res->end = 0; - } else { - crashk_res.end = ram_res->start - 1; - } - - crash_free_reserved_phys_range(ram_res->start, ram_res->end); - insert_resource(&iomem_resource, ram_res); - - return 0; -} - -int crash_shrink_memory(unsigned long new_size) -{ - int ret = 0; - unsigned long old_size, low_size; - - if (!kexec_trylock()) - return -EBUSY; - - if (kexec_crash_image) { - ret = -ENOENT; - goto unlock; - } - - low_size = crash_resource_size(&crashk_low_res); - old_size = crash_resource_size(&crashk_res) + low_size; - new_size = roundup(new_size, KEXEC_CRASH_MEM_ALIGN); - if (new_size >= old_size) { - ret = (new_size == old_size) ? 0 : -EINVAL; - goto unlock; - } - - /* - * (low_size > new_size) implies that low_size is greater than zero. - * This also means that if low_size is zero, the else branch is taken. - * - * If low_size is greater than 0, (low_size > new_size) indicates that - * crashk_low_res also needs to be shrunken. Otherwise, only crashk_res - * needs to be shrunken. - */ - if (low_size > new_size) { - ret = __crash_shrink_memory(&crashk_res, 0); - if (ret) - goto unlock; - - ret = __crash_shrink_memory(&crashk_low_res, new_size); - } else { - ret = __crash_shrink_memory(&crashk_res, new_size - low_size); - } - - /* Swap crashk_res and crashk_low_res if needed */ - if (!crashk_res.end && crashk_low_res.end) { - crashk_res.start = crashk_low_res.start; - crashk_res.end = crashk_low_res.end; - release_resource(&crashk_low_res); - crashk_low_res.start = 0; - crashk_low_res.end = 0; - insert_resource(&iomem_resource, &crashk_res); - } - -unlock: - kexec_unlock(); - return ret; -} - -void crash_save_cpu(struct pt_regs *regs, int cpu) -{ - struct elf_prstatus prstatus; - u32 *buf; - - if ((cpu < 0) || (cpu >= nr_cpu_ids)) - return; - - /* Using ELF notes here is opportunistic. - * I need a well defined structure format - * for the data I pass, and I need tags - * on the data to indicate what information I have - * squirrelled away. ELF notes happen to provide - * all of that, so there is no need to invent something new. - */ - buf = (u32 *)per_cpu_ptr(crash_notes, cpu); - if (!buf) - return; - memset(&prstatus, 0, sizeof(prstatus)); - prstatus.common.pr_pid = current->pid; - elf_core_copy_regs(&prstatus.pr_reg, regs); - buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, - &prstatus, sizeof(prstatus)); - final_note(buf); -} - -/* * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. */ diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index bef2f6f2571b..2d1db05fbf04 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -285,11 +285,13 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, kexec_file_dbg_print = !!(flags & KEXEC_FILE_DEBUG); image->file_mode = 1; +#ifdef CONFIG_CRASH_DUMP if (kexec_on_panic) { /* Enable special crash kernel control page alloc policy. */ image->control_page = crashk_res.start; image->type = KEXEC_TYPE_CRASH; } +#endif ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, cmdline_ptr, cmdline_len, flags); @@ -349,13 +351,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, if (!kexec_trylock()) return -EBUSY; +#ifdef CONFIG_CRASH_DUMP if (image_type == KEXEC_TYPE_CRASH) { dest_image = &kexec_crash_image; if (kexec_crash_image) arch_kexec_unprotect_crashkres(); - } else { + } else +#endif dest_image = &kexec_image; - } if (flags & KEXEC_FILE_UNLOAD) goto exchange; @@ -419,8 +422,10 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, exchange: image = xchg(dest_image, image); out: +#ifdef CONFIG_CRASH_DUMP if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image) arch_kexec_protect_crashkres(); +#endif kexec_unlock(); kimage_free(image); @@ -535,8 +540,10 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf, phys_addr_t mstart, mend; struct resource res = { }; +#ifdef CONFIG_CRASH_DUMP if (kbuf->image->type == KEXEC_TYPE_CRASH) return func(&crashk_res, kbuf); +#endif /* * Using MEMBLOCK_NONE will properly skip MEMBLOCK_DRIVER_MANAGED. See @@ -595,12 +602,14 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf, static int kexec_walk_resources(struct kexec_buf *kbuf, int (*func)(struct resource *, void *)) { +#ifdef CONFIG_CRASH_DUMP if (kbuf->image->type == KEXEC_TYPE_CRASH) return walk_iomem_res_desc(crashk_res.desc, IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, crashk_res.start, crashk_res.end, kbuf, func); - else if (kbuf->top_down) +#endif + if (kbuf->top_down) return walk_system_ram_res_rev(0, ULONG_MAX, kbuf, func); else return walk_system_ram_res(0, ULONG_MAX, kbuf, func); diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 74da1409cd14..2595defe8c0d 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -4,6 +4,8 @@ #include <linux/kexec.h> +struct kexec_segment; + struct kimage *do_kimage_alloc_init(void); int sanity_check_segment_list(struct kimage *image); void kimage_free_page_list(struct list_head *list); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 1d4bc493b2f4..495b69a71a5d 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -39,7 +39,7 @@ static struct kobj_attribute _name##_attr = __ATTR_RW(_name) static ssize_t uevent_seqnum_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sysfs_emit(buf, "%llu\n", (unsigned long long)uevent_seqnum); + return sysfs_emit(buf, "%llu\n", (u64)atomic64_read(&uevent_seqnum)); } KERNEL_ATTR_RO(uevent_seqnum); @@ -120,6 +120,7 @@ static ssize_t kexec_loaded_show(struct kobject *kobj, } KERNEL_ATTR_RO(kexec_loaded); +#ifdef CONFIG_CRASH_DUMP static ssize_t kexec_crash_loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -152,9 +153,10 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj, } KERNEL_ATTR_RW(kexec_crash_size); +#endif /* CONFIG_CRASH_DUMP*/ #endif /* CONFIG_KEXEC_CORE */ -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_VMCORE_INFO static ssize_t vmcoreinfo_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -177,7 +179,7 @@ KERNEL_ATTR_RO(crash_elfcorehdr_size); #endif -#endif /* CONFIG_CRASH_CORE */ +#endif /* CONFIG_VMCORE_INFO */ /* whether file capabilities are enabled */ static ssize_t fscaps_show(struct kobject *kobj, @@ -262,10 +264,12 @@ static struct attribute * kernel_attrs[] = { #endif #ifdef CONFIG_KEXEC_CORE &kexec_loaded_attr.attr, +#ifdef CONFIG_CRASH_DUMP &kexec_crash_loaded_attr.attr, &kexec_crash_size_attr.attr, #endif -#ifdef CONFIG_CRASH_CORE +#endif +#ifdef CONFIG_VMCORE_INFO &vmcoreinfo_attr.attr, #ifdef CONFIG_CRASH_HOTPLUG &crash_elfcorehdr_size_attr.attr, diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index 0ea1b2970a23..c3ced519e14b 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -362,8 +362,7 @@ config MODPROBE_PATH userspace can still load modules explicitly). config TRIM_UNUSED_KSYMS - bool "Trim unused exported kernel symbols" if EXPERT - depends on !COMPILE_TEST + bool "Trim unused exported kernel symbols" help The kernel and some modules make many symbols available for other modules to use via EXPORT_SYMBOL() and variants. Depending diff --git a/kernel/module/internal.h b/kernel/module/internal.h index c8b7b4dcf782..2ebece8a789f 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -322,9 +322,9 @@ static inline struct module *mod_find(unsigned long addr, struct mod_tree_root * } #endif /* CONFIG_MODULES_TREE_LOOKUP */ -void module_enable_ro(const struct module *mod, bool after_init); -void module_enable_nx(const struct module *mod); -void module_enable_x(const struct module *mod); +int module_enable_rodata_ro(const struct module *mod, bool after_init); +int module_enable_data_nx(const struct module *mod); +int module_enable_text_rox(const struct module *mod); int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, char *secstrings, struct module *mod); diff --git a/kernel/module/main.c b/kernel/module/main.c index 36681911c05a..e1e8a7a9d6c1 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -2489,6 +2489,11 @@ static void do_free_init(struct work_struct *w) } } +void flush_module_init_free_work(void) +{ + flush_work(&init_free_wq); +} + #undef MODULE_PARAM_PREFIX #define MODULE_PARAM_PREFIX "module." /* Default value for module->async_probe_requested */ @@ -2571,7 +2576,9 @@ static noinline int do_init_module(struct module *mod) /* Switch to core kallsyms now init is done: kallsyms may be walking! */ rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms); #endif - module_enable_ro(mod, true); + ret = module_enable_rodata_ro(mod, true); + if (ret) + goto fail_mutex_unlock; mod_tree_remove_init(mod); module_arch_freeing_init(mod); for_class_mod_mem_type(type, init) { @@ -2593,8 +2600,8 @@ static noinline int do_init_module(struct module *mod) * Note that module_alloc() on most architectures creates W+X page * mappings which won't be cleaned up until do_free_init() runs. Any * code such as mark_rodata_ro() which depends on those mappings to - * be cleaned up needs to sync with the queued work - ie - * rcu_barrier() + * be cleaned up needs to sync with the queued work by invoking + * flush_module_init_free_work(). */ if (llist_add(&freeinit->node, &init_free_list)) schedule_work(&init_free_wq); @@ -2609,6 +2616,8 @@ static noinline int do_init_module(struct module *mod) return 0; +fail_mutex_unlock: + mutex_unlock(&module_mutex); fail_free_freeinit: kfree(freeinit); fail: @@ -2736,9 +2745,15 @@ static int complete_formation(struct module *mod, struct load_info *info) module_bug_finalize(info->hdr, info->sechdrs, mod); module_cfi_finalize(info->hdr, info->sechdrs, mod); - module_enable_ro(mod, false); - module_enable_nx(mod); - module_enable_x(mod); + err = module_enable_rodata_ro(mod, false); + if (err) + goto out_strict_rwx; + err = module_enable_data_nx(mod); + if (err) + goto out_strict_rwx; + err = module_enable_text_rox(mod); + if (err) + goto out_strict_rwx; /* * Mark state as coming so strong_try_module_get() ignores us, @@ -2749,6 +2764,8 @@ static int complete_formation(struct module *mod, struct load_info *info) return 0; +out_strict_rwx: + module_bug_cleanup(mod); out: mutex_unlock(&module_mutex); return err; diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c index a2b656b4e3d2..c45caa4690e5 100644 --- a/kernel/module/strict_rwx.c +++ b/kernel/module/strict_rwx.c @@ -11,13 +11,16 @@ #include <linux/set_memory.h> #include "internal.h" -static void module_set_memory(const struct module *mod, enum mod_mem_type type, - int (*set_memory)(unsigned long start, int num_pages)) +static int module_set_memory(const struct module *mod, enum mod_mem_type type, + int (*set_memory)(unsigned long start, int num_pages)) { const struct module_memory *mod_mem = &mod->mem[type]; + if (!mod_mem->base) + return 0; + set_vm_flush_reset_perms(mod_mem->base); - set_memory((unsigned long)mod_mem->base, mod_mem->size >> PAGE_SHIFT); + return set_memory((unsigned long)mod_mem->base, mod_mem->size >> PAGE_SHIFT); } /* @@ -26,37 +29,53 @@ static void module_set_memory(const struct module *mod, enum mod_mem_type type, * CONFIG_STRICT_MODULE_RWX because they are needed regardless of whether we * are strict. */ -void module_enable_x(const struct module *mod) +int module_enable_text_rox(const struct module *mod) { - for_class_mod_mem_type(type, text) - module_set_memory(mod, type, set_memory_x); + for_class_mod_mem_type(type, text) { + int ret; + + if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + ret = module_set_memory(mod, type, set_memory_rox); + else + ret = module_set_memory(mod, type, set_memory_x); + if (ret) + return ret; + } + return 0; } -void module_enable_ro(const struct module *mod, bool after_init) +int module_enable_rodata_ro(const struct module *mod, bool after_init) { - if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) - return; -#ifdef CONFIG_STRICT_MODULE_RWX - if (!rodata_enabled) - return; -#endif + int ret; + + if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX) || !rodata_enabled) + return 0; - module_set_memory(mod, MOD_TEXT, set_memory_ro); - module_set_memory(mod, MOD_INIT_TEXT, set_memory_ro); - module_set_memory(mod, MOD_RODATA, set_memory_ro); - module_set_memory(mod, MOD_INIT_RODATA, set_memory_ro); + ret = module_set_memory(mod, MOD_RODATA, set_memory_ro); + if (ret) + return ret; + ret = module_set_memory(mod, MOD_INIT_RODATA, set_memory_ro); + if (ret) + return ret; if (after_init) - module_set_memory(mod, MOD_RO_AFTER_INIT, set_memory_ro); + return module_set_memory(mod, MOD_RO_AFTER_INIT, set_memory_ro); + + return 0; } -void module_enable_nx(const struct module *mod) +int module_enable_data_nx(const struct module *mod) { if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) - return; + return 0; - for_class_mod_mem_type(type, data) - module_set_memory(mod, type, set_memory_nx); + for_class_mod_mem_type(type, data) { + int ret = module_set_memory(mod, type, set_memory_nx); + + if (ret) + return ret; + } + return 0; } int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, diff --git a/kernel/padata.c b/kernel/padata.c index 179fb1518070..e3f639ff1670 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -485,7 +485,8 @@ void __init padata_do_multithreaded(struct padata_mt_job *job) struct padata_work my_work, *pw; struct padata_mt_job_state ps; LIST_HEAD(works); - int nworks; + int nworks, nid; + static atomic_t last_used_nid __initdata; if (job->size == 0) return; @@ -517,7 +518,16 @@ void __init padata_do_multithreaded(struct padata_mt_job *job) ps.chunk_size = roundup(ps.chunk_size, job->align); list_for_each_entry(pw, &works, pw_list) - queue_work(system_unbound_wq, &pw->pw_work); + if (job->numa_aware) { + int old_node = atomic_read(&last_used_nid); + + do { + nid = next_node_in(old_node, node_states[N_CPU]); + } while (!atomic_try_cmpxchg(&last_used_nid, &old_node, nid)); + queue_work_node(nid, system_unbound_wq, &pw->pw_work); + } else { + queue_work(system_unbound_wq, &pw->pw_work); + } /* Use the current thread, which saves starting a workqueue worker. */ padata_work_init(&my_work, padata_mt_helper, &ps, PADATA_WORK_ONSTACK); diff --git a/kernel/panic.c b/kernel/panic.c index 2807639aab51..747c3f3d289a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -73,6 +73,7 @@ EXPORT_SYMBOL_GPL(panic_timeout); #define PANIC_PRINT_FTRACE_INFO 0x00000010 #define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020 #define PANIC_PRINT_ALL_CPU_BT 0x00000040 +#define PANIC_PRINT_BLOCKED_TASKS 0x00000080 unsigned long panic_print; ATOMIC_NOTIFIER_HEAD(panic_notifier_list); @@ -227,6 +228,9 @@ static void panic_print_sys_info(bool console_flush) if (panic_print & PANIC_PRINT_FTRACE_INFO) ftrace_dump(DUMP_ALL); + + if (panic_print & PANIC_PRINT_BLOCKED_TASKS) + show_state_filter(TASK_UNINTERRUPTIBLE); } void check_panic_on_warn(const char *origin) @@ -446,6 +450,14 @@ void panic(const char *fmt, ...) /* Do not scroll important messages printed above */ suppress_printk = 1; + + /* + * The final messages may not have been printed if in a context that + * defers printing (such as NMI) and irq_work is not available. + * Explicitly flush the kernel log buffer one last time. + */ + console_flush_on_panic(CONSOLE_FLUSH_PENDING); + local_irq_enable(); for (i = 0; ; i += PANIC_TIMER_STEP) { touch_softlockup_watchdog(); @@ -666,8 +678,13 @@ void __warn(const char *file, int line, void *caller, unsigned taint, pr_warn("WARNING: CPU: %d PID: %d at %pS\n", raw_smp_processor_id(), current->pid, caller); +#pragma GCC diagnostic push +#ifndef __clang__ +#pragma GCC diagnostic ignored "-Wsuggest-attribute=format" +#endif if (args) vprintk(args->fmt, args->args); +#pragma GCC diagnostic pop print_modules(); diff --git a/kernel/pid.c b/kernel/pid.c index 99a0c5eb24b8..da76ed1873f7 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -62,17 +62,13 @@ struct pid init_struct_pid = { int pid_max = PID_MAX_DEFAULT; -#define RESERVED_PIDS 300 - int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; -#ifdef CONFIG_FS_PID /* * Pseudo filesystems start inode numbering after one. We use Reserved * PIDs as a natural offset. */ static u64 pidfs_ino = RESERVED_PIDS; -#endif /* * PID-map pages start out as NULL, they get allocated upon @@ -280,10 +276,8 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, spin_lock_irq(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; -#ifdef CONFIG_FS_PID pid->stashed = NULL; pid->ino = ++pidfs_ino; -#endif for ( ; upid >= pid->numbers; --upid) { /* Make the PID visible to find_pid_ns. */ idr_replace(&upid->ns->idr, pid, upid->nr); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 4b31629c5be4..afce8130d8b9 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -39,9 +39,9 @@ config HIBERNATION bool "Hibernation (aka 'suspend to disk')" depends on SWAP && ARCH_HIBERNATION_POSSIBLE select HIBERNATE_CALLBACKS - select LZO_COMPRESS - select LZO_DECOMPRESS select CRC32 + select CRYPTO + select CRYPTO_LZO help Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the @@ -92,6 +92,28 @@ config HIBERNATION_SNAPSHOT_DEV If in doubt, say Y. +choice + prompt "Default compressor" + default HIBERNATION_COMP_LZO + depends on HIBERNATION + +config HIBERNATION_COMP_LZO + bool "lzo" + depends on CRYPTO_LZO + +config HIBERNATION_COMP_LZ4 + bool "lz4" + depends on CRYPTO_LZ4 + +endchoice + +config HIBERNATION_DEF_COMP + string + default "lzo" if HIBERNATION_COMP_LZO + default "lz4" if HIBERNATION_COMP_LZ4 + help + Default compressor to be used for hibernation. + config PM_STD_PARTITION string "Default resume partition" depends on HIBERNATION diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 7b44f5b89fa1..9e1c9aa399ea 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -23,6 +23,12 @@ */ static DEFINE_MUTEX(em_pd_mutex); +static void em_cpufreq_update_efficiencies(struct device *dev, + struct em_perf_state *table); +static void em_check_capacity_update(void); +static void em_update_workfn(struct work_struct *work); +static DECLARE_DELAYED_WORK(em_update_work, em_update_workfn); + static bool _is_cpu_device(struct device *dev) { return (dev->bus == &cpu_subsys); @@ -31,19 +37,65 @@ static bool _is_cpu_device(struct device *dev) #ifdef CONFIG_DEBUG_FS static struct dentry *rootdir; -static void em_debug_create_ps(struct em_perf_state *ps, struct dentry *pd) +struct em_dbg_info { + struct em_perf_domain *pd; + int ps_id; +}; + +#define DEFINE_EM_DBG_SHOW(name, fname) \ +static int em_debug_##fname##_show(struct seq_file *s, void *unused) \ +{ \ + struct em_dbg_info *em_dbg = s->private; \ + struct em_perf_state *table; \ + unsigned long val; \ + \ + rcu_read_lock(); \ + table = em_perf_state_from_pd(em_dbg->pd); \ + val = table[em_dbg->ps_id].name; \ + rcu_read_unlock(); \ + \ + seq_printf(s, "%lu\n", val); \ + return 0; \ +} \ +DEFINE_SHOW_ATTRIBUTE(em_debug_##fname) + +DEFINE_EM_DBG_SHOW(frequency, frequency); +DEFINE_EM_DBG_SHOW(power, power); +DEFINE_EM_DBG_SHOW(cost, cost); +DEFINE_EM_DBG_SHOW(performance, performance); +DEFINE_EM_DBG_SHOW(flags, inefficiency); + +static void em_debug_create_ps(struct em_perf_domain *em_pd, + struct em_dbg_info *em_dbg, int i, + struct dentry *pd) { + struct em_perf_state *table; + unsigned long freq; struct dentry *d; char name[24]; - snprintf(name, sizeof(name), "ps:%lu", ps->frequency); + em_dbg[i].pd = em_pd; + em_dbg[i].ps_id = i; + + rcu_read_lock(); + table = em_perf_state_from_pd(em_pd); + freq = table[i].frequency; + rcu_read_unlock(); + + snprintf(name, sizeof(name), "ps:%lu", freq); /* Create per-ps directory */ d = debugfs_create_dir(name, pd); - debugfs_create_ulong("frequency", 0444, d, &ps->frequency); - debugfs_create_ulong("power", 0444, d, &ps->power); - debugfs_create_ulong("cost", 0444, d, &ps->cost); - debugfs_create_ulong("inefficient", 0444, d, &ps->flags); + debugfs_create_file("frequency", 0444, d, &em_dbg[i], + &em_debug_frequency_fops); + debugfs_create_file("power", 0444, d, &em_dbg[i], + &em_debug_power_fops); + debugfs_create_file("cost", 0444, d, &em_dbg[i], + &em_debug_cost_fops); + debugfs_create_file("performance", 0444, d, &em_dbg[i], + &em_debug_performance_fops); + debugfs_create_file("inefficient", 0444, d, &em_dbg[i], + &em_debug_inefficiency_fops); } static int em_debug_cpus_show(struct seq_file *s, void *unused) @@ -66,6 +118,7 @@ DEFINE_SHOW_ATTRIBUTE(em_debug_flags); static void em_debug_create_pd(struct device *dev) { + struct em_dbg_info *em_dbg; struct dentry *d; int i; @@ -79,9 +132,14 @@ static void em_debug_create_pd(struct device *dev) debugfs_create_file("flags", 0444, d, dev->em_pd, &em_debug_flags_fops); + em_dbg = devm_kcalloc(dev, dev->em_pd->nr_perf_states, + sizeof(*em_dbg), GFP_KERNEL); + if (!em_dbg) + return; + /* Create a sub-directory for each performance state */ for (i = 0; i < dev->em_pd->nr_perf_states; i++) - em_debug_create_ps(&dev->em_pd->table[i], d); + em_debug_create_ps(dev->em_pd, em_dbg, i, d); } @@ -103,18 +161,192 @@ static void em_debug_create_pd(struct device *dev) {} static void em_debug_remove_pd(struct device *dev) {} #endif +static void em_destroy_table_rcu(struct rcu_head *rp) +{ + struct em_perf_table __rcu *table; + + table = container_of(rp, struct em_perf_table, rcu); + kfree(table); +} + +static void em_release_table_kref(struct kref *kref) +{ + struct em_perf_table __rcu *table; + + /* It was the last owner of this table so we can free */ + table = container_of(kref, struct em_perf_table, kref); + + call_rcu(&table->rcu, em_destroy_table_rcu); +} + +/** + * em_table_free() - Handles safe free of the EM table when needed + * @table : EM table which is going to be freed + * + * No return values. + */ +void em_table_free(struct em_perf_table __rcu *table) +{ + kref_put(&table->kref, em_release_table_kref); +} + +/** + * em_table_alloc() - Allocate a new EM table + * @pd : EM performance domain for which this must be done + * + * Allocate a new EM table and initialize its kref to indicate that it + * has a user. + * Returns allocated table or NULL. + */ +struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd) +{ + struct em_perf_table __rcu *table; + int table_size; + + table_size = sizeof(struct em_perf_state) * pd->nr_perf_states; + + table = kzalloc(sizeof(*table) + table_size, GFP_KERNEL); + if (!table) + return NULL; + + kref_init(&table->kref); + + return table; +} + +static void em_init_performance(struct device *dev, struct em_perf_domain *pd, + struct em_perf_state *table, int nr_states) +{ + u64 fmax, max_cap; + int i, cpu; + + /* This is needed only for CPUs and EAS skip other devices */ + if (!_is_cpu_device(dev)) + return; + + cpu = cpumask_first(em_span_cpus(pd)); + + /* + * Calculate the performance value for each frequency with + * linear relationship. The final CPU capacity might not be ready at + * boot time, but the EM will be updated a bit later with correct one. + */ + fmax = (u64) table[nr_states - 1].frequency; + max_cap = (u64) arch_scale_cpu_capacity(cpu); + for (i = 0; i < nr_states; i++) + table[i].performance = div64_u64(max_cap * table[i].frequency, + fmax); +} + +static int em_compute_costs(struct device *dev, struct em_perf_state *table, + struct em_data_callback *cb, int nr_states, + unsigned long flags) +{ + unsigned long prev_cost = ULONG_MAX; + int i, ret; + + /* Compute the cost of each performance state. */ + for (i = nr_states - 1; i >= 0; i--) { + unsigned long power_res, cost; + + if ((flags & EM_PERF_DOMAIN_ARTIFICIAL) && cb->get_cost) { + ret = cb->get_cost(dev, table[i].frequency, &cost); + if (ret || !cost || cost > EM_MAX_POWER) { + dev_err(dev, "EM: invalid cost %lu %d\n", + cost, ret); + return -EINVAL; + } + } else { + /* increase resolution of 'cost' precision */ + power_res = table[i].power * 10; + cost = power_res / table[i].performance; + } + + table[i].cost = cost; + + if (table[i].cost >= prev_cost) { + table[i].flags = EM_PERF_STATE_INEFFICIENT; + dev_dbg(dev, "EM: OPP:%lu is inefficient\n", + table[i].frequency); + } else { + prev_cost = table[i].cost; + } + } + + return 0; +} + +/** + * em_dev_compute_costs() - Calculate cost values for new runtime EM table + * @dev : Device for which the EM table is to be updated + * @table : The new EM table that is going to get the costs calculated + * @nr_states : Number of performance states + * + * Calculate the em_perf_state::cost values for new runtime EM table. The + * values are used for EAS during task placement. It also calculates and sets + * the efficiency flag for each performance state. When the function finish + * successfully the EM table is ready to be updated and used by EAS. + * + * Return 0 on success or a proper error in case of failure. + */ +int em_dev_compute_costs(struct device *dev, struct em_perf_state *table, + int nr_states) +{ + return em_compute_costs(dev, table, NULL, nr_states, 0); +} + +/** + * em_dev_update_perf_domain() - Update runtime EM table for a device + * @dev : Device for which the EM is to be updated + * @new_table : The new EM table that is going to be used from now + * + * Update EM runtime modifiable table for the @dev using the provided @table. + * + * This function uses a mutex to serialize writers, so it must not be called + * from a non-sleeping context. + * + * Return 0 on success or an error code on failure. + */ +int em_dev_update_perf_domain(struct device *dev, + struct em_perf_table __rcu *new_table) +{ + struct em_perf_table __rcu *old_table; + struct em_perf_domain *pd; + + if (!dev) + return -EINVAL; + + /* Serialize update/unregister or concurrent updates */ + mutex_lock(&em_pd_mutex); + + if (!dev->em_pd) { + mutex_unlock(&em_pd_mutex); + return -EINVAL; + } + pd = dev->em_pd; + + kref_get(&new_table->kref); + + old_table = pd->em_table; + rcu_assign_pointer(pd->em_table, new_table); + + em_cpufreq_update_efficiencies(dev, new_table->state); + + em_table_free(old_table); + + mutex_unlock(&em_pd_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(em_dev_update_perf_domain); + static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, - int nr_states, struct em_data_callback *cb, + struct em_perf_state *table, + struct em_data_callback *cb, unsigned long flags) { - unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX; - struct em_perf_state *table; + unsigned long power, freq, prev_freq = 0; + int nr_states = pd->nr_perf_states; int i, ret; - u64 fmax; - - table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL); - if (!table) - return -ENOMEM; /* Build the list of performance states for this performance domain */ for (i = 0, freq = 0; i < nr_states; i++, freq++) { @@ -127,7 +359,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, if (ret) { dev_err(dev, "EM: invalid perf. state: %d\n", ret); - goto free_ps_table; + return -EINVAL; } /* @@ -137,7 +369,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, if (freq <= prev_freq) { dev_err(dev, "EM: non-increasing freq: %lu\n", freq); - goto free_ps_table; + return -EINVAL; } /* @@ -147,55 +379,27 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, if (!power || power > EM_MAX_POWER) { dev_err(dev, "EM: invalid power: %lu\n", power); - goto free_ps_table; + return -EINVAL; } table[i].power = power; table[i].frequency = prev_freq = freq; } - /* Compute the cost of each performance state. */ - fmax = (u64) table[nr_states - 1].frequency; - for (i = nr_states - 1; i >= 0; i--) { - unsigned long power_res, cost; - - if (flags & EM_PERF_DOMAIN_ARTIFICIAL) { - ret = cb->get_cost(dev, table[i].frequency, &cost); - if (ret || !cost || cost > EM_MAX_POWER) { - dev_err(dev, "EM: invalid cost %lu %d\n", - cost, ret); - goto free_ps_table; - } - } else { - power_res = table[i].power; - cost = div64_u64(fmax * power_res, table[i].frequency); - } - - table[i].cost = cost; - - if (table[i].cost >= prev_cost) { - table[i].flags = EM_PERF_STATE_INEFFICIENT; - dev_dbg(dev, "EM: OPP:%lu is inefficient\n", - table[i].frequency); - } else { - prev_cost = table[i].cost; - } - } + em_init_performance(dev, pd, table, nr_states); - pd->table = table; - pd->nr_perf_states = nr_states; + ret = em_compute_costs(dev, table, cb, nr_states, flags); + if (ret) + return -EINVAL; return 0; - -free_ps_table: - kfree(table); - return -EINVAL; } static int em_create_pd(struct device *dev, int nr_states, struct em_data_callback *cb, cpumask_t *cpus, unsigned long flags) { + struct em_perf_table __rcu *em_table; struct em_perf_domain *pd; struct device *cpu_dev; int cpu, ret, num_cpus; @@ -220,11 +424,17 @@ static int em_create_pd(struct device *dev, int nr_states, return -ENOMEM; } - ret = em_create_perf_table(dev, pd, nr_states, cb, flags); - if (ret) { - kfree(pd); - return ret; - } + pd->nr_perf_states = nr_states; + + em_table = em_table_alloc(pd); + if (!em_table) + goto free_pd; + + ret = em_create_perf_table(dev, pd, em_table->state, cb, flags); + if (ret) + goto free_pd_table; + + rcu_assign_pointer(pd->em_table, em_table); if (_is_cpu_device(dev)) for_each_cpu(cpu, cpus) { @@ -235,26 +445,37 @@ static int em_create_pd(struct device *dev, int nr_states, dev->em_pd = pd; return 0; + +free_pd_table: + kfree(em_table); +free_pd: + kfree(pd); + return -EINVAL; } -static void em_cpufreq_update_efficiencies(struct device *dev) +static void +em_cpufreq_update_efficiencies(struct device *dev, struct em_perf_state *table) { struct em_perf_domain *pd = dev->em_pd; - struct em_perf_state *table; struct cpufreq_policy *policy; int found = 0; - int i; + int i, cpu; - if (!_is_cpu_device(dev) || !pd) + if (!_is_cpu_device(dev)) return; - policy = cpufreq_cpu_get(cpumask_first(em_span_cpus(pd))); - if (!policy) { - dev_warn(dev, "EM: Access to CPUFreq policy failed"); + /* Try to get a CPU which is active and in this PD */ + cpu = cpumask_first_and(em_span_cpus(pd), cpu_active_mask); + if (cpu >= nr_cpu_ids) { + dev_warn(dev, "EM: No online CPU for CPUFreq policy\n"); return; } - table = pd->table; + policy = cpufreq_cpu_get(cpu); + if (!policy) { + dev_warn(dev, "EM: Access to CPUFreq policy failed\n"); + return; + } for (i = 0; i < pd->nr_perf_states; i++) { if (!(table[i].flags & EM_PERF_STATE_INEFFICIENT)) @@ -391,19 +612,34 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, else if (cb->get_cost) flags |= EM_PERF_DOMAIN_ARTIFICIAL; + /* + * EM only supports uW (exception is artificial EM). + * Therefore, check and force the drivers to provide + * power in uW. + */ + if (!microwatts && !(flags & EM_PERF_DOMAIN_ARTIFICIAL)) { + dev_err(dev, "EM: only supports uW power values\n"); + ret = -EINVAL; + goto unlock; + } + ret = em_create_pd(dev, nr_states, cb, cpus, flags); if (ret) goto unlock; dev->em_pd->flags |= flags; - em_cpufreq_update_efficiencies(dev); + em_cpufreq_update_efficiencies(dev, dev->em_pd->em_table->state); em_debug_create_pd(dev); dev_info(dev, "EM: created perf domain\n"); unlock: mutex_unlock(&em_pd_mutex); + + if (_is_cpu_device(dev)) + em_check_capacity_update(); + return ret; } EXPORT_SYMBOL_GPL(em_dev_register_perf_domain); @@ -430,9 +666,125 @@ void em_dev_unregister_perf_domain(struct device *dev) mutex_lock(&em_pd_mutex); em_debug_remove_pd(dev); - kfree(dev->em_pd->table); + em_table_free(dev->em_pd->em_table); + kfree(dev->em_pd); dev->em_pd = NULL; mutex_unlock(&em_pd_mutex); } EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain); + +/* + * Adjustment of CPU performance values after boot, when all CPUs capacites + * are correctly calculated. + */ +static void em_adjust_new_capacity(struct device *dev, + struct em_perf_domain *pd, + u64 max_cap) +{ + struct em_perf_table __rcu *em_table; + struct em_perf_state *ps, *new_ps; + int ret, ps_size; + + em_table = em_table_alloc(pd); + if (!em_table) { + dev_warn(dev, "EM: allocation failed\n"); + return; + } + + new_ps = em_table->state; + + rcu_read_lock(); + ps = em_perf_state_from_pd(pd); + /* Initialize data based on old table */ + ps_size = sizeof(struct em_perf_state) * pd->nr_perf_states; + memcpy(new_ps, ps, ps_size); + + rcu_read_unlock(); + + em_init_performance(dev, pd, new_ps, pd->nr_perf_states); + ret = em_compute_costs(dev, new_ps, NULL, pd->nr_perf_states, + pd->flags); + if (ret) { + dev_warn(dev, "EM: compute costs failed\n"); + return; + } + + ret = em_dev_update_perf_domain(dev, em_table); + if (ret) + dev_warn(dev, "EM: update failed %d\n", ret); + + /* + * This is one-time-update, so give up the ownership in this updater. + * The EM framework has incremented the usage counter and from now + * will keep the reference (then free the memory when needed). + */ + em_table_free(em_table); +} + +static void em_check_capacity_update(void) +{ + cpumask_var_t cpu_done_mask; + struct em_perf_state *table; + struct em_perf_domain *pd; + unsigned long cpu_capacity; + int cpu; + + if (!zalloc_cpumask_var(&cpu_done_mask, GFP_KERNEL)) { + pr_warn("no free memory\n"); + return; + } + + /* Check if CPUs capacity has changed than update EM */ + for_each_possible_cpu(cpu) { + struct cpufreq_policy *policy; + unsigned long em_max_perf; + struct device *dev; + + if (cpumask_test_cpu(cpu, cpu_done_mask)) + continue; + + policy = cpufreq_cpu_get(cpu); + if (!policy) { + pr_debug("Accessing cpu%d policy failed\n", cpu); + schedule_delayed_work(&em_update_work, + msecs_to_jiffies(1000)); + break; + } + cpufreq_cpu_put(policy); + + pd = em_cpu_get(cpu); + if (!pd || em_is_artificial(pd)) + continue; + + cpumask_or(cpu_done_mask, cpu_done_mask, + em_span_cpus(pd)); + + cpu_capacity = arch_scale_cpu_capacity(cpu); + + rcu_read_lock(); + table = em_perf_state_from_pd(pd); + em_max_perf = table[pd->nr_perf_states - 1].performance; + rcu_read_unlock(); + + /* + * Check if the CPU capacity has been adjusted during boot + * and trigger the update for new performance values. + */ + if (em_max_perf == cpu_capacity) + continue; + + pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n", + cpu, cpu_capacity, em_max_perf); + + dev = get_cpu_device(cpu); + em_adjust_new_capacity(dev, pd, cpu_capacity); + } + + free_cpumask_var(cpu_done_mask); +} + +static void em_update_workfn(struct work_struct *work) +{ + em_check_capacity_update(); +} diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 4b0b7cf2e019..43b1a82e800c 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -47,6 +47,15 @@ dev_t swsusp_resume_device; sector_t swsusp_resume_block; __visible int in_suspend __nosavedata; +static char hibernate_compressor[CRYPTO_MAX_ALG_NAME] = CONFIG_HIBERNATION_DEF_COMP; + +/* + * Compression/decompression algorithm to be used while saving/loading + * image to/from disk. This would later be used in 'kernel/power/swap.c' + * to allocate comp streams. + */ +char hib_comp_algo[CRYPTO_MAX_ALG_NAME]; + enum { HIBERNATION_INVALID, HIBERNATION_PLATFORM, @@ -718,6 +727,9 @@ static int load_image_and_restore(void) return error; } +#define COMPRESSION_ALGO_LZO "lzo" +#define COMPRESSION_ALGO_LZ4 "lz4" + /** * hibernate - Carry out system hibernation, including saving the image. */ @@ -732,6 +744,17 @@ int hibernate(void) return -EPERM; } + /* + * Query for the compression algorithm support if compression is enabled. + */ + if (!nocompress) { + strscpy(hib_comp_algo, hibernate_compressor, sizeof(hib_comp_algo)); + if (crypto_has_comp(hib_comp_algo, 0, 0) != 1) { + pr_err("%s compression is not available\n", hib_comp_algo); + return -EOPNOTSUPP; + } + } + sleep_flags = lock_system_sleep(); /* The snapshot device should not be opened while we're running */ if (!hibernate_acquire()) { @@ -766,11 +789,24 @@ int hibernate(void) if (hibernation_mode == HIBERNATION_PLATFORM) flags |= SF_PLATFORM_MODE; - if (nocompress) + if (nocompress) { flags |= SF_NOCOMPRESS_MODE; - else + } else { flags |= SF_CRC32_MODE; + /* + * By default, LZO compression is enabled. Use SF_COMPRESSION_ALG_LZ4 + * to override this behaviour and use LZ4. + * + * Refer kernel/power/power.h for more details + */ + + if (!strcmp(hib_comp_algo, COMPRESSION_ALGO_LZ4)) + flags |= SF_COMPRESSION_ALG_LZ4; + else + flags |= SF_COMPRESSION_ALG_LZO; + } + pm_pr_dbg("Writing hibernation image.\n"); error = swsusp_write(flags); swsusp_free(); @@ -955,6 +991,22 @@ static int software_resume(void) if (error) goto Unlock; + /* + * Check if the hibernation image is compressed. If so, query for + * the algorithm support. + */ + if (!(swsusp_header_flags & SF_NOCOMPRESS_MODE)) { + if (swsusp_header_flags & SF_COMPRESSION_ALG_LZ4) + strscpy(hib_comp_algo, COMPRESSION_ALGO_LZ4, sizeof(hib_comp_algo)); + else + strscpy(hib_comp_algo, COMPRESSION_ALGO_LZO, sizeof(hib_comp_algo)); + if (crypto_has_comp(hib_comp_algo, 0, 0) != 1) { + pr_err("%s compression is not available\n", hib_comp_algo); + error = -EOPNOTSUPP; + goto Unlock; + } + } + /* The snapshot device should not be opened while we're running */ if (!hibernate_acquire()) { error = -EBUSY; @@ -1370,6 +1422,57 @@ static int __init nohibernate_setup(char *str) return 1; } +static const char * const comp_alg_enabled[] = { +#if IS_ENABLED(CONFIG_CRYPTO_LZO) + COMPRESSION_ALGO_LZO, +#endif +#if IS_ENABLED(CONFIG_CRYPTO_LZ4) + COMPRESSION_ALGO_LZ4, +#endif +}; + +static int hibernate_compressor_param_set(const char *compressor, + const struct kernel_param *kp) +{ + unsigned int sleep_flags; + int index, ret; + + sleep_flags = lock_system_sleep(); + + index = sysfs_match_string(comp_alg_enabled, compressor); + if (index >= 0) { + ret = param_set_copystring(comp_alg_enabled[index], kp); + if (!ret) + strscpy(hib_comp_algo, comp_alg_enabled[index], + sizeof(hib_comp_algo)); + } else { + ret = index; + } + + unlock_system_sleep(sleep_flags); + + if (ret) + pr_debug("Cannot set specified compressor %s\n", + compressor); + + return ret; +} + +static const struct kernel_param_ops hibernate_compressor_param_ops = { + .set = hibernate_compressor_param_set, + .get = param_get_string, +}; + +static struct kparam_string hibernate_compressor_param_string = { + .maxlen = sizeof(hibernate_compressor), + .string = hibernate_compressor, +}; + +module_param_cb(compressor, &hibernate_compressor_param_ops, + &hibernate_compressor_param_string, 0644); +MODULE_PARM_DESC(compressor, + "Compression algorithm to be used with hibernation"); + __setup("noresume", noresume_setup); __setup("resume_offset=", resume_offset_setup); __setup("resume=", resume_setup); diff --git a/kernel/power/main.c b/kernel/power/main.c index b1ae9b677d03..a9e0693aaf69 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -95,19 +95,6 @@ int unregister_pm_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_pm_notifier); -void pm_report_hw_sleep_time(u64 t) -{ - suspend_stats.last_hw_sleep = t; - suspend_stats.total_hw_sleep += t; -} -EXPORT_SYMBOL_GPL(pm_report_hw_sleep_time); - -void pm_report_max_hw_sleep(u64 t) -{ - suspend_stats.max_hw_sleep = t; -} -EXPORT_SYMBOL_GPL(pm_report_max_hw_sleep); - int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down) { int ret; @@ -319,26 +306,86 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, power_attr(pm_test); #endif /* CONFIG_PM_SLEEP_DEBUG */ -static char *suspend_step_name(enum suspend_stat_step step) -{ - switch (step) { - case SUSPEND_FREEZE: - return "freeze"; - case SUSPEND_PREPARE: - return "prepare"; - case SUSPEND_SUSPEND: - return "suspend"; - case SUSPEND_SUSPEND_NOIRQ: - return "suspend_noirq"; - case SUSPEND_RESUME_NOIRQ: - return "resume_noirq"; - case SUSPEND_RESUME: - return "resume"; - default: - return ""; +#define SUSPEND_NR_STEPS SUSPEND_RESUME +#define REC_FAILED_NUM 2 + +struct suspend_stats { + unsigned int step_failures[SUSPEND_NR_STEPS]; + unsigned int success; + unsigned int fail; + int last_failed_dev; + char failed_devs[REC_FAILED_NUM][40]; + int last_failed_errno; + int errno[REC_FAILED_NUM]; + int last_failed_step; + u64 last_hw_sleep; + u64 total_hw_sleep; + u64 max_hw_sleep; + enum suspend_stat_step failed_steps[REC_FAILED_NUM]; +}; + +static struct suspend_stats suspend_stats; +static DEFINE_MUTEX(suspend_stats_lock); + +void dpm_save_failed_dev(const char *name) +{ + mutex_lock(&suspend_stats_lock); + + strscpy(suspend_stats.failed_devs[suspend_stats.last_failed_dev], + name, sizeof(suspend_stats.failed_devs[0])); + suspend_stats.last_failed_dev++; + suspend_stats.last_failed_dev %= REC_FAILED_NUM; + + mutex_unlock(&suspend_stats_lock); +} + +void dpm_save_failed_step(enum suspend_stat_step step) +{ + suspend_stats.step_failures[step-1]++; + suspend_stats.failed_steps[suspend_stats.last_failed_step] = step; + suspend_stats.last_failed_step++; + suspend_stats.last_failed_step %= REC_FAILED_NUM; +} + +void dpm_save_errno(int err) +{ + if (!err) { + suspend_stats.success++; + return; } + + suspend_stats.fail++; + + suspend_stats.errno[suspend_stats.last_failed_errno] = err; + suspend_stats.last_failed_errno++; + suspend_stats.last_failed_errno %= REC_FAILED_NUM; } +void pm_report_hw_sleep_time(u64 t) +{ + suspend_stats.last_hw_sleep = t; + suspend_stats.total_hw_sleep += t; +} +EXPORT_SYMBOL_GPL(pm_report_hw_sleep_time); + +void pm_report_max_hw_sleep(u64 t) +{ + suspend_stats.max_hw_sleep = t; +} +EXPORT_SYMBOL_GPL(pm_report_max_hw_sleep); + +static const char * const suspend_step_names[] = { + [SUSPEND_WORKING] = "", + [SUSPEND_FREEZE] = "freeze", + [SUSPEND_PREPARE] = "prepare", + [SUSPEND_SUSPEND] = "suspend", + [SUSPEND_SUSPEND_LATE] = "suspend_late", + [SUSPEND_SUSPEND_NOIRQ] = "suspend_noirq", + [SUSPEND_RESUME_NOIRQ] = "resume_noirq", + [SUSPEND_RESUME_EARLY] = "resume_early", + [SUSPEND_RESUME] = "resume", +}; + #define suspend_attr(_name, format_str) \ static ssize_t _name##_show(struct kobject *kobj, \ struct kobj_attribute *attr, char *buf) \ @@ -347,20 +394,30 @@ static ssize_t _name##_show(struct kobject *kobj, \ } \ static struct kobj_attribute _name = __ATTR_RO(_name) -suspend_attr(success, "%d\n"); -suspend_attr(fail, "%d\n"); -suspend_attr(failed_freeze, "%d\n"); -suspend_attr(failed_prepare, "%d\n"); -suspend_attr(failed_suspend, "%d\n"); -suspend_attr(failed_suspend_late, "%d\n"); -suspend_attr(failed_suspend_noirq, "%d\n"); -suspend_attr(failed_resume, "%d\n"); -suspend_attr(failed_resume_early, "%d\n"); -suspend_attr(failed_resume_noirq, "%d\n"); +suspend_attr(success, "%u\n"); +suspend_attr(fail, "%u\n"); suspend_attr(last_hw_sleep, "%llu\n"); suspend_attr(total_hw_sleep, "%llu\n"); suspend_attr(max_hw_sleep, "%llu\n"); +#define suspend_step_attr(_name, step) \ +static ssize_t _name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ +{ \ + return sprintf(buf, "%u\n", \ + suspend_stats.step_failures[step-1]); \ +} \ +static struct kobj_attribute _name = __ATTR_RO(_name) + +suspend_step_attr(failed_freeze, SUSPEND_FREEZE); +suspend_step_attr(failed_prepare, SUSPEND_PREPARE); +suspend_step_attr(failed_suspend, SUSPEND_SUSPEND); +suspend_step_attr(failed_suspend_late, SUSPEND_SUSPEND_LATE); +suspend_step_attr(failed_suspend_noirq, SUSPEND_SUSPEND_NOIRQ); +suspend_step_attr(failed_resume, SUSPEND_RESUME); +suspend_step_attr(failed_resume_early, SUSPEND_RESUME_EARLY); +suspend_step_attr(failed_resume_noirq, SUSPEND_RESUME_NOIRQ); + static ssize_t last_failed_dev_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -392,16 +449,14 @@ static struct kobj_attribute last_failed_errno = __ATTR_RO(last_failed_errno); static ssize_t last_failed_step_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - int index; enum suspend_stat_step step; - char *last_failed_step = NULL; + int index; index = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; index %= REC_FAILED_NUM; step = suspend_stats.failed_steps[index]; - last_failed_step = suspend_step_name(step); - return sprintf(buf, "%s\n", last_failed_step); + return sprintf(buf, "%s\n", suspend_step_names[step]); } static struct kobj_attribute last_failed_step = __ATTR_RO(last_failed_step); @@ -449,6 +504,7 @@ static const struct attribute_group suspend_attr_group = { static int suspend_stats_show(struct seq_file *s, void *unused) { int i, index, last_dev, last_errno, last_step; + enum suspend_stat_step step; last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; last_dev %= REC_FAILED_NUM; @@ -456,47 +512,35 @@ static int suspend_stats_show(struct seq_file *s, void *unused) last_errno %= REC_FAILED_NUM; last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; last_step %= REC_FAILED_NUM; - seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n" - "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n", - "success", suspend_stats.success, - "fail", suspend_stats.fail, - "failed_freeze", suspend_stats.failed_freeze, - "failed_prepare", suspend_stats.failed_prepare, - "failed_suspend", suspend_stats.failed_suspend, - "failed_suspend_late", - suspend_stats.failed_suspend_late, - "failed_suspend_noirq", - suspend_stats.failed_suspend_noirq, - "failed_resume", suspend_stats.failed_resume, - "failed_resume_early", - suspend_stats.failed_resume_early, - "failed_resume_noirq", - suspend_stats.failed_resume_noirq); + + seq_printf(s, "success: %u\nfail: %u\n", + suspend_stats.success, suspend_stats.fail); + + for (step = SUSPEND_FREEZE; step <= SUSPEND_NR_STEPS; step++) + seq_printf(s, "failed_%s: %u\n", suspend_step_names[step], + suspend_stats.step_failures[step-1]); + seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", - suspend_stats.failed_devs[last_dev]); + suspend_stats.failed_devs[last_dev]); for (i = 1; i < REC_FAILED_NUM; i++) { index = last_dev + REC_FAILED_NUM - i; index %= REC_FAILED_NUM; - seq_printf(s, "\t\t\t%-s\n", - suspend_stats.failed_devs[index]); + seq_printf(s, "\t\t\t%-s\n", suspend_stats.failed_devs[index]); } seq_printf(s, " last_failed_errno:\t%-d\n", suspend_stats.errno[last_errno]); for (i = 1; i < REC_FAILED_NUM; i++) { index = last_errno + REC_FAILED_NUM - i; index %= REC_FAILED_NUM; - seq_printf(s, "\t\t\t%-d\n", - suspend_stats.errno[index]); + seq_printf(s, "\t\t\t%-d\n", suspend_stats.errno[index]); } seq_printf(s, " last_failed_step:\t%-s\n", - suspend_step_name( - suspend_stats.failed_steps[last_step])); + suspend_step_names[suspend_stats.failed_steps[last_step]]); for (i = 1; i < REC_FAILED_NUM; i++) { index = last_step + REC_FAILED_NUM - i; index %= REC_FAILED_NUM; seq_printf(s, "\t\t\t%-s\n", - suspend_step_name( - suspend_stats.failed_steps[index])); + suspend_step_names[suspend_stats.failed_steps[index]]); } return 0; diff --git a/kernel/power/power.h b/kernel/power/power.h index 8499a39c62f4..de0e6b1077f2 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -6,6 +6,7 @@ #include <linux/compiler.h> #include <linux/cpu.h> #include <linux/cpuidle.h> +#include <linux/crypto.h> struct swsusp_info { struct new_utsname uts; @@ -54,6 +55,10 @@ asmlinkage int swsusp_save(void); /* kernel/power/hibernate.c */ extern bool freezer_test_done; +extern char hib_comp_algo[CRYPTO_MAX_ALG_NAME]; + +/* kernel/power/swap.c */ +extern unsigned int swsusp_header_flags; extern int hibernation_snapshot(int platform_mode); extern int hibernation_restore(int platform_mode); @@ -148,7 +153,7 @@ extern unsigned int snapshot_additional_pages(struct zone *zone); extern unsigned long snapshot_get_image_size(void); extern int snapshot_read_next(struct snapshot_handle *handle); extern int snapshot_write_next(struct snapshot_handle *handle); -extern void snapshot_write_finalize(struct snapshot_handle *handle); +int snapshot_write_finalize(struct snapshot_handle *handle); extern int snapshot_image_loaded(struct snapshot_handle *handle); extern bool hibernate_acquire(void); @@ -162,11 +167,25 @@ extern int swsusp_swap_in_use(void); * Flags that can be passed from the hibernatig hernel to the "boot" kernel in * the image header. */ +#define SF_COMPRESSION_ALG_LZO 0 /* dummy, details given below */ #define SF_PLATFORM_MODE 1 #define SF_NOCOMPRESS_MODE 2 #define SF_CRC32_MODE 4 #define SF_HW_SIG 8 +/* + * Bit to indicate the compression algorithm to be used(for LZ4). The same + * could be checked while saving/loading image to/from disk to use the + * corresponding algorithms. + * + * By default, LZO compression is enabled if SF_CRC32_MODE is set. Use + * SF_COMPRESSION_ALG_LZ4 to override this behaviour and use LZ4. + * + * SF_CRC32_MODE, SF_COMPRESSION_ALG_LZO(dummy) -> Compression, LZO + * SF_CRC32_MODE, SF_COMPRESSION_ALG_LZ4 -> Compression, LZ4 + */ +#define SF_COMPRESSION_ALG_LZ4 16 + /* kernel/power/hibernate.c */ int swsusp_check(bool exclusive); extern void swsusp_free(void); @@ -327,3 +346,5 @@ static inline void pm_sleep_enable_secondary_cpus(void) suspend_enable_secondary_cpus(); cpuidle_resume(); } + +void dpm_save_errno(int err); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 5c96ff067c64..405eddbda4fc 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -58,22 +58,24 @@ static inline void hibernate_restore_protection_end(void) hibernate_restore_protection_active = false; } -static inline void hibernate_restore_protect_page(void *page_address) +static inline int __must_check hibernate_restore_protect_page(void *page_address) { if (hibernate_restore_protection_active) - set_memory_ro((unsigned long)page_address, 1); + return set_memory_ro((unsigned long)page_address, 1); + return 0; } -static inline void hibernate_restore_unprotect_page(void *page_address) +static inline int hibernate_restore_unprotect_page(void *page_address) { if (hibernate_restore_protection_active) - set_memory_rw((unsigned long)page_address, 1); + return set_memory_rw((unsigned long)page_address, 1); + return 0; } #else static inline void hibernate_restore_protection_begin(void) {} static inline void hibernate_restore_protection_end(void) {} -static inline void hibernate_restore_protect_page(void *page_address) {} -static inline void hibernate_restore_unprotect_page(void *page_address) {} +static inline int __must_check hibernate_restore_protect_page(void *page_address) {return 0; } +static inline int hibernate_restore_unprotect_page(void *page_address) {return 0; } #endif /* CONFIG_STRICT_KERNEL_RWX && CONFIG_ARCH_HAS_SET_MEMORY */ @@ -2832,7 +2834,9 @@ next: } } else { copy_last_highmem_page(); - hibernate_restore_protect_page(handle->buffer); + error = hibernate_restore_protect_page(handle->buffer); + if (error) + return error; handle->buffer = get_buffer(&orig_bm, &ca); if (IS_ERR(handle->buffer)) return PTR_ERR(handle->buffer); @@ -2858,15 +2862,18 @@ next: * stored in highmem. Additionally, it recycles bitmap memory that's not * necessary any more. */ -void snapshot_write_finalize(struct snapshot_handle *handle) +int snapshot_write_finalize(struct snapshot_handle *handle) { + int error; + copy_last_highmem_page(); - hibernate_restore_protect_page(handle->buffer); + error = hibernate_restore_protect_page(handle->buffer); /* Do that only if we have loaded the image entirely */ if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages + nr_zero_pages) { memory_bm_recycle(&orig_bm); free_highmem_data(); } + return error; } int snapshot_image_loaded(struct snapshot_handle *handle) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index fa3bf161d13f..e3ae93bbcb9b 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -192,6 +192,7 @@ static int __init mem_sleep_default_setup(char *str) if (mem_sleep_labels[state] && !strcmp(str, mem_sleep_labels[state])) { mem_sleep_default = state; + mem_sleep_current = state; break; } @@ -367,7 +368,6 @@ static int suspend_prepare(suspend_state_t state) if (!error) return 0; - suspend_stats.failed_freeze++; dpm_save_failed_step(SUSPEND_FREEZE); pm_notifier_call_chain(PM_POST_SUSPEND); Restore: @@ -617,12 +617,7 @@ int pm_suspend(suspend_state_t state) pr_info("suspend entry (%s)\n", mem_sleep_labels[state]); error = enter_state(state); - if (error) { - suspend_stats.fail++; - dpm_save_failed_errno(error); - } else { - suspend_stats.success++; - } + dpm_save_errno(error); pr_info("suspend exit\n"); return error; } diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index b663a97f5867..d4856ec61570 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -201,7 +201,7 @@ static int __init test_suspend(void) } /* RTCs have initialized by now too ... can we use one? */ - dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm); + dev = class_find_device(&rtc_class, NULL, NULL, has_wakealarm); if (dev) { rtc = rtc_class_open(dev_name(dev)); put_device(dev); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 692f12fe60c1..5bc04bfe2db1 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -23,7 +23,6 @@ #include <linux/swapops.h> #include <linux/pm.h> #include <linux/slab.h> -#include <linux/lzo.h> #include <linux/vmalloc.h> #include <linux/cpumask.h> #include <linux/atomic.h> @@ -339,6 +338,13 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) return error; } +/* + * Hold the swsusp_header flag. This is used in software_resume() in + * 'kernel/power/hibernate' to check if the image is compressed and query + * for the compression algorithm support(if so). + */ +unsigned int swsusp_header_flags; + /** * swsusp_swap_check - check if the resume device is a swap device * and get its index (if so) @@ -514,25 +520,30 @@ static int swap_writer_finish(struct swap_map_handle *handle, return error; } +/* + * Bytes we need for compressed data in worst case. We assume(limitation) + * this is the worst of all the compression algorithms. + */ +#define bytes_worst_compress(x) ((x) + ((x) / 16) + 64 + 3 + 2) + /* We need to remember how much compressed data we need to read. */ -#define LZO_HEADER sizeof(size_t) +#define CMP_HEADER sizeof(size_t) /* Number of pages/bytes we'll compress at one time. */ -#define LZO_UNC_PAGES 32 -#define LZO_UNC_SIZE (LZO_UNC_PAGES * PAGE_SIZE) +#define UNC_PAGES 32 +#define UNC_SIZE (UNC_PAGES * PAGE_SIZE) -/* Number of pages/bytes we need for compressed data (worst case). */ -#define LZO_CMP_PAGES DIV_ROUND_UP(lzo1x_worst_compress(LZO_UNC_SIZE) + \ - LZO_HEADER, PAGE_SIZE) -#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE) +/* Number of pages we need for compressed data (worst case). */ +#define CMP_PAGES DIV_ROUND_UP(bytes_worst_compress(UNC_SIZE) + \ + CMP_HEADER, PAGE_SIZE) +#define CMP_SIZE (CMP_PAGES * PAGE_SIZE) /* Maximum number of threads for compression/decompression. */ -#define LZO_THREADS 3 +#define CMP_THREADS 3 /* Minimum/maximum number of pages for read buffering. */ -#define LZO_MIN_RD_PAGES 1024 -#define LZO_MAX_RD_PAGES 8192 - +#define CMP_MIN_RD_PAGES 1024 +#define CMP_MAX_RD_PAGES 8192 /** * save_image - save the suspend image data @@ -593,8 +604,8 @@ struct crc_data { wait_queue_head_t go; /* start crc update */ wait_queue_head_t done; /* crc update done */ u32 *crc32; /* points to handle's crc32 */ - size_t *unc_len[LZO_THREADS]; /* uncompressed lengths */ - unsigned char *unc[LZO_THREADS]; /* uncompressed data */ + size_t *unc_len[CMP_THREADS]; /* uncompressed lengths */ + unsigned char *unc[CMP_THREADS]; /* uncompressed data */ }; /* @@ -625,10 +636,11 @@ static int crc32_threadfn(void *data) return 0; } /* - * Structure used for LZO data compression. + * Structure used for data compression. */ struct cmp_data { struct task_struct *thr; /* thread */ + struct crypto_comp *cc; /* crypto compressor stream */ atomic_t ready; /* ready to start flag */ atomic_t stop; /* ready to stop flag */ int ret; /* return code */ @@ -636,17 +648,20 @@ struct cmp_data { wait_queue_head_t done; /* compression done */ size_t unc_len; /* uncompressed length */ size_t cmp_len; /* compressed length */ - unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */ - unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */ - unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */ + unsigned char unc[UNC_SIZE]; /* uncompressed buffer */ + unsigned char cmp[CMP_SIZE]; /* compressed buffer */ }; +/* Indicates the image size after compression */ +static atomic_t compressed_size = ATOMIC_INIT(0); + /* * Compression function that runs in its own thread. */ -static int lzo_compress_threadfn(void *data) +static int compress_threadfn(void *data) { struct cmp_data *d = data; + unsigned int cmp_len = 0; while (1) { wait_event(d->go, atomic_read_acquire(&d->ready) || @@ -660,9 +675,13 @@ static int lzo_compress_threadfn(void *data) } atomic_set(&d->ready, 0); - d->ret = lzo1x_1_compress(d->unc, d->unc_len, - d->cmp + LZO_HEADER, &d->cmp_len, - d->wrk); + cmp_len = CMP_SIZE - CMP_HEADER; + d->ret = crypto_comp_compress(d->cc, d->unc, d->unc_len, + d->cmp + CMP_HEADER, + &cmp_len); + d->cmp_len = cmp_len; + + atomic_set(&compressed_size, atomic_read(&compressed_size) + d->cmp_len); atomic_set_release(&d->stop, 1); wake_up(&d->done); } @@ -670,14 +689,14 @@ static int lzo_compress_threadfn(void *data) } /** - * save_image_lzo - Save the suspend image data compressed with LZO. + * save_compressed_image - Save the suspend image data after compression. * @handle: Swap map handle to use for saving the image. * @snapshot: Image to read data from. * @nr_to_write: Number of pages to save. */ -static int save_image_lzo(struct swap_map_handle *handle, - struct snapshot_handle *snapshot, - unsigned int nr_to_write) +static int save_compressed_image(struct swap_map_handle *handle, + struct snapshot_handle *snapshot, + unsigned int nr_to_write) { unsigned int m; int ret = 0; @@ -694,23 +713,25 @@ static int save_image_lzo(struct swap_map_handle *handle, hib_init_batch(&hb); + atomic_set(&compressed_size, 0); + /* * We'll limit the number of threads for compression to limit memory * footprint. */ nr_threads = num_online_cpus() - 1; - nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); + nr_threads = clamp_val(nr_threads, 1, CMP_THREADS); page = (void *)__get_free_page(GFP_NOIO | __GFP_HIGH); if (!page) { - pr_err("Failed to allocate LZO page\n"); + pr_err("Failed to allocate %s page\n", hib_comp_algo); ret = -ENOMEM; goto out_clean; } data = vzalloc(array_size(nr_threads, sizeof(*data))); if (!data) { - pr_err("Failed to allocate LZO data\n"); + pr_err("Failed to allocate %s data\n", hib_comp_algo); ret = -ENOMEM; goto out_clean; } @@ -729,7 +750,14 @@ static int save_image_lzo(struct swap_map_handle *handle, init_waitqueue_head(&data[thr].go); init_waitqueue_head(&data[thr].done); - data[thr].thr = kthread_run(lzo_compress_threadfn, + data[thr].cc = crypto_alloc_comp(hib_comp_algo, 0, 0); + if (IS_ERR_OR_NULL(data[thr].cc)) { + pr_err("Could not allocate comp stream %ld\n", PTR_ERR(data[thr].cc)); + ret = -EFAULT; + goto out_clean; + } + + data[thr].thr = kthread_run(compress_threadfn, &data[thr], "image_compress/%u", thr); if (IS_ERR(data[thr].thr)) { @@ -767,7 +795,7 @@ static int save_image_lzo(struct swap_map_handle *handle, */ handle->reqd_free_pages = reqd_free_pages(); - pr_info("Using %u thread(s) for compression\n", nr_threads); + pr_info("Using %u thread(s) for %s compression\n", nr_threads, hib_comp_algo); pr_info("Compressing and saving image data (%u pages)...\n", nr_to_write); m = nr_to_write / 10; @@ -777,7 +805,7 @@ static int save_image_lzo(struct swap_map_handle *handle, start = ktime_get(); for (;;) { for (thr = 0; thr < nr_threads; thr++) { - for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { + for (off = 0; off < UNC_SIZE; off += PAGE_SIZE) { ret = snapshot_read_next(snapshot); if (ret < 0) goto out_finish; @@ -817,14 +845,14 @@ static int save_image_lzo(struct swap_map_handle *handle, ret = data[thr].ret; if (ret < 0) { - pr_err("LZO compression failed\n"); + pr_err("%s compression failed\n", hib_comp_algo); goto out_finish; } if (unlikely(!data[thr].cmp_len || data[thr].cmp_len > - lzo1x_worst_compress(data[thr].unc_len))) { - pr_err("Invalid LZO compressed length\n"); + bytes_worst_compress(data[thr].unc_len))) { + pr_err("Invalid %s compressed length\n", hib_comp_algo); ret = -1; goto out_finish; } @@ -840,7 +868,7 @@ static int save_image_lzo(struct swap_map_handle *handle, * read it. */ for (off = 0; - off < LZO_HEADER + data[thr].cmp_len; + off < CMP_HEADER + data[thr].cmp_len; off += PAGE_SIZE) { memcpy(page, data[thr].cmp + off, PAGE_SIZE); @@ -862,6 +890,9 @@ out_finish: if (!ret) pr_info("Image saving done\n"); swsusp_show_speed(start, stop, nr_to_write, "Wrote"); + pr_info("Image size after compression: %d kbytes\n", + (atomic_read(&compressed_size) / 1024)); + out_clean: hib_finish_batch(&hb); if (crc) { @@ -870,9 +901,12 @@ out_clean: kfree(crc); } if (data) { - for (thr = 0; thr < nr_threads; thr++) + for (thr = 0; thr < nr_threads; thr++) { if (data[thr].thr) kthread_stop(data[thr].thr); + if (data[thr].cc) + crypto_free_comp(data[thr].cc); + } vfree(data); } if (page) free_page((unsigned long)page); @@ -942,7 +976,7 @@ int swsusp_write(unsigned int flags) if (!error) { error = (flags & SF_NOCOMPRESS_MODE) ? save_image(&handle, &snapshot, pages - 1) : - save_image_lzo(&handle, &snapshot, pages - 1); + save_compressed_image(&handle, &snapshot, pages - 1); } out_finish: error = swap_writer_finish(&handle, flags, error); @@ -1100,8 +1134,8 @@ static int load_image(struct swap_map_handle *handle, ret = err2; if (!ret) { pr_info("Image loading done\n"); - snapshot_write_finalize(snapshot); - if (!snapshot_image_loaded(snapshot)) + ret = snapshot_write_finalize(snapshot); + if (!ret && !snapshot_image_loaded(snapshot)) ret = -ENODATA; } swsusp_show_speed(start, stop, nr_to_read, "Read"); @@ -1109,10 +1143,11 @@ static int load_image(struct swap_map_handle *handle, } /* - * Structure used for LZO data decompression. + * Structure used for data decompression. */ struct dec_data { struct task_struct *thr; /* thread */ + struct crypto_comp *cc; /* crypto compressor stream */ atomic_t ready; /* ready to start flag */ atomic_t stop; /* ready to stop flag */ int ret; /* return code */ @@ -1120,16 +1155,17 @@ struct dec_data { wait_queue_head_t done; /* decompression done */ size_t unc_len; /* uncompressed length */ size_t cmp_len; /* compressed length */ - unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */ - unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */ + unsigned char unc[UNC_SIZE]; /* uncompressed buffer */ + unsigned char cmp[CMP_SIZE]; /* compressed buffer */ }; /* * Decompression function that runs in its own thread. */ -static int lzo_decompress_threadfn(void *data) +static int decompress_threadfn(void *data) { struct dec_data *d = data; + unsigned int unc_len = 0; while (1) { wait_event(d->go, atomic_read_acquire(&d->ready) || @@ -1143,9 +1179,11 @@ static int lzo_decompress_threadfn(void *data) } atomic_set(&d->ready, 0); - d->unc_len = LZO_UNC_SIZE; - d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len, - d->unc, &d->unc_len); + unc_len = UNC_SIZE; + d->ret = crypto_comp_decompress(d->cc, d->cmp + CMP_HEADER, d->cmp_len, + d->unc, &unc_len); + d->unc_len = unc_len; + if (clean_pages_on_decompress) flush_icache_range((unsigned long)d->unc, (unsigned long)d->unc + d->unc_len); @@ -1157,14 +1195,14 @@ static int lzo_decompress_threadfn(void *data) } /** - * load_image_lzo - Load compressed image data and decompress them with LZO. + * load_compressed_image - Load compressed image data and decompress it. * @handle: Swap map handle to use for loading data. * @snapshot: Image to copy uncompressed data into. * @nr_to_read: Number of pages to load. */ -static int load_image_lzo(struct swap_map_handle *handle, - struct snapshot_handle *snapshot, - unsigned int nr_to_read) +static int load_compressed_image(struct swap_map_handle *handle, + struct snapshot_handle *snapshot, + unsigned int nr_to_read) { unsigned int m; int ret = 0; @@ -1189,18 +1227,18 @@ static int load_image_lzo(struct swap_map_handle *handle, * footprint. */ nr_threads = num_online_cpus() - 1; - nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); + nr_threads = clamp_val(nr_threads, 1, CMP_THREADS); - page = vmalloc(array_size(LZO_MAX_RD_PAGES, sizeof(*page))); + page = vmalloc(array_size(CMP_MAX_RD_PAGES, sizeof(*page))); if (!page) { - pr_err("Failed to allocate LZO page\n"); + pr_err("Failed to allocate %s page\n", hib_comp_algo); ret = -ENOMEM; goto out_clean; } data = vzalloc(array_size(nr_threads, sizeof(*data))); if (!data) { - pr_err("Failed to allocate LZO data\n"); + pr_err("Failed to allocate %s data\n", hib_comp_algo); ret = -ENOMEM; goto out_clean; } @@ -1221,7 +1259,14 @@ static int load_image_lzo(struct swap_map_handle *handle, init_waitqueue_head(&data[thr].go); init_waitqueue_head(&data[thr].done); - data[thr].thr = kthread_run(lzo_decompress_threadfn, + data[thr].cc = crypto_alloc_comp(hib_comp_algo, 0, 0); + if (IS_ERR_OR_NULL(data[thr].cc)) { + pr_err("Could not allocate comp stream %ld\n", PTR_ERR(data[thr].cc)); + ret = -EFAULT; + goto out_clean; + } + + data[thr].thr = kthread_run(decompress_threadfn, &data[thr], "image_decompress/%u", thr); if (IS_ERR(data[thr].thr)) { @@ -1262,18 +1307,18 @@ static int load_image_lzo(struct swap_map_handle *handle, */ if (low_free_pages() > snapshot_get_image_size()) read_pages = (low_free_pages() - snapshot_get_image_size()) / 2; - read_pages = clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES); + read_pages = clamp_val(read_pages, CMP_MIN_RD_PAGES, CMP_MAX_RD_PAGES); for (i = 0; i < read_pages; i++) { - page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? + page[i] = (void *)__get_free_page(i < CMP_PAGES ? GFP_NOIO | __GFP_HIGH : GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); if (!page[i]) { - if (i < LZO_CMP_PAGES) { + if (i < CMP_PAGES) { ring_size = i; - pr_err("Failed to allocate LZO pages\n"); + pr_err("Failed to allocate %s pages\n", hib_comp_algo); ret = -ENOMEM; goto out_clean; } else { @@ -1283,7 +1328,7 @@ static int load_image_lzo(struct swap_map_handle *handle, } want = ring_size = i; - pr_info("Using %u thread(s) for decompression\n", nr_threads); + pr_info("Using %u thread(s) for %s decompression\n", nr_threads, hib_comp_algo); pr_info("Loading and decompressing image data (%u pages)...\n", nr_to_read); m = nr_to_read / 10; @@ -1344,13 +1389,13 @@ static int load_image_lzo(struct swap_map_handle *handle, data[thr].cmp_len = *(size_t *)page[pg]; if (unlikely(!data[thr].cmp_len || data[thr].cmp_len > - lzo1x_worst_compress(LZO_UNC_SIZE))) { - pr_err("Invalid LZO compressed length\n"); + bytes_worst_compress(UNC_SIZE))) { + pr_err("Invalid %s compressed length\n", hib_comp_algo); ret = -1; goto out_finish; } - need = DIV_ROUND_UP(data[thr].cmp_len + LZO_HEADER, + need = DIV_ROUND_UP(data[thr].cmp_len + CMP_HEADER, PAGE_SIZE); if (need > have) { if (eof > 1) { @@ -1361,7 +1406,7 @@ static int load_image_lzo(struct swap_map_handle *handle, } for (off = 0; - off < LZO_HEADER + data[thr].cmp_len; + off < CMP_HEADER + data[thr].cmp_len; off += PAGE_SIZE) { memcpy(data[thr].cmp + off, page[pg], PAGE_SIZE); @@ -1378,7 +1423,7 @@ static int load_image_lzo(struct swap_map_handle *handle, /* * Wait for more data while we are decompressing. */ - if (have < LZO_CMP_PAGES && asked) { + if (have < CMP_PAGES && asked) { ret = hib_wait_io(&hb); if (ret) goto out_finish; @@ -1396,14 +1441,14 @@ static int load_image_lzo(struct swap_map_handle *handle, ret = data[thr].ret; if (ret < 0) { - pr_err("LZO decompression failed\n"); + pr_err("%s decompression failed\n", hib_comp_algo); goto out_finish; } if (unlikely(!data[thr].unc_len || - data[thr].unc_len > LZO_UNC_SIZE || - data[thr].unc_len & (PAGE_SIZE - 1))) { - pr_err("Invalid LZO uncompressed length\n"); + data[thr].unc_len > UNC_SIZE || + data[thr].unc_len & (PAGE_SIZE - 1))) { + pr_err("Invalid %s uncompressed length\n", hib_comp_algo); ret = -1; goto out_finish; } @@ -1441,8 +1486,8 @@ out_finish: stop = ktime_get(); if (!ret) { pr_info("Image loading done\n"); - snapshot_write_finalize(snapshot); - if (!snapshot_image_loaded(snapshot)) + ret = snapshot_write_finalize(snapshot); + if (!ret && !snapshot_image_loaded(snapshot)) ret = -ENODATA; if (!ret) { if (swsusp_header->flags & SF_CRC32_MODE) { @@ -1464,9 +1509,12 @@ out_clean: kfree(crc); } if (data) { - for (thr = 0; thr < nr_threads; thr++) + for (thr = 0; thr < nr_threads; thr++) { if (data[thr].thr) kthread_stop(data[thr].thr); + if (data[thr].cc) + crypto_free_comp(data[thr].cc); + } vfree(data); } vfree(page); @@ -1500,7 +1548,7 @@ int swsusp_read(unsigned int *flags_p) if (!error) { error = (*flags_p & SF_NOCOMPRESS_MODE) ? load_image(&handle, &snapshot, header->pages - 1) : - load_image_lzo(&handle, &snapshot, header->pages - 1); + load_compressed_image(&handle, &snapshot, header->pages - 1); } swap_reader_finish(&handle); end: @@ -1535,6 +1583,7 @@ int swsusp_check(bool exclusive) if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); + swsusp_header_flags = swsusp_header->flags; /* Reset swap signature now */ error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC, swsusp_resume_block, diff --git a/kernel/power/user.c b/kernel/power/user.c index 3a4e70366f35..3aa41ba22129 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -317,7 +317,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, break; case SNAPSHOT_ATOMIC_RESTORE: - snapshot_write_finalize(&data->handle); + error = snapshot_write_finalize(&data->handle); + if (error) + break; if (data->mode != O_WRONLY || !data->frozen || !snapshot_image_loaded(&data->handle)) { error = -EPERM; diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index b96077152f49..c8093bcc01fe 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -140,39 +140,6 @@ static inline bool nbcon_state_try_cmpxchg(struct console *con, struct nbcon_sta return atomic_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_state), &cur->atom, new->atom); } -#ifdef CONFIG_64BIT - -#define __seq_to_nbcon_seq(seq) (seq) -#define __nbcon_seq_to_seq(seq) (seq) - -#else /* CONFIG_64BIT */ - -#define __seq_to_nbcon_seq(seq) ((u32)seq) - -static inline u64 __nbcon_seq_to_seq(u32 nbcon_seq) -{ - u64 seq; - u64 rb_next_seq; - - /* - * The provided sequence is only the lower 32 bits of the ringbuffer - * sequence. It needs to be expanded to 64bit. Get the next sequence - * number from the ringbuffer and fold it. - * - * Having a 32bit representation in the console is sufficient. - * If a console ever gets more than 2^31 records behind - * the ringbuffer then this is the least of the problems. - * - * Also the access to the ring buffer is always safe. - */ - rb_next_seq = prb_next_seq(prb); - seq = rb_next_seq - ((u32)rb_next_seq - nbcon_seq); - - return seq; -} - -#endif /* CONFIG_64BIT */ - /** * nbcon_seq_read - Read the current console sequence * @con: Console to read the sequence of @@ -183,7 +150,7 @@ u64 nbcon_seq_read(struct console *con) { unsigned long nbcon_seq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_seq)); - return __nbcon_seq_to_seq(nbcon_seq); + return __ulseq_to_u64seq(prb, nbcon_seq); } /** @@ -204,7 +171,7 @@ void nbcon_seq_force(struct console *con, u64 seq) */ u64 valid_seq = max_t(u64, seq, prb_first_valid_seq(prb)); - atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __seq_to_nbcon_seq(valid_seq)); + atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __u64seq_to_ulseq(valid_seq)); /* Clear con->seq since nbcon consoles use con->nbcon_seq instead. */ con->seq = 0; @@ -223,11 +190,11 @@ void nbcon_seq_force(struct console *con, u64 seq) */ static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq) { - unsigned long nbcon_seq = __seq_to_nbcon_seq(ctxt->seq); + unsigned long nbcon_seq = __u64seq_to_ulseq(ctxt->seq); struct console *con = ctxt->console; if (atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_seq), &nbcon_seq, - __seq_to_nbcon_seq(new_seq))) { + __u64seq_to_ulseq(new_seq))) { ctxt->seq = new_seq; } else { ctxt->seq = nbcon_seq_read(con); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index f2444b581e16..ca5146006b94 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -34,7 +34,7 @@ #include <linux/security.h> #include <linux/memblock.h> #include <linux/syscalls.h> -#include <linux/crash_core.h> +#include <linux/vmcore_info.h> #include <linux/ratelimit.h> #include <linux/kmsg_dump.h> #include <linux/syslog.h> @@ -347,6 +347,29 @@ static bool panic_in_progress(void) return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); } +/* Return true if a panic is in progress on the current CPU. */ +bool this_cpu_in_panic(void) +{ + /* + * We can use raw_smp_processor_id() here because it is impossible for + * the task to be migrated to the panic_cpu, or away from it. If + * panic_cpu has already been set, and we're not currently executing on + * that CPU, then we never will be. + */ + return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id()); +} + +/* + * Return true if a panic is in progress on a remote CPU. + * + * On true, the local CPU should immediately release any printing resources + * that may be needed by the panic CPU. + */ +bool other_cpu_in_panic(void) +{ + return (panic_in_progress() && !this_cpu_in_panic()); +} + /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's @@ -439,12 +462,6 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; static DEFINE_MUTEX(syslog_lock); #ifdef CONFIG_PRINTK -/* - * During panic, heavy printk by other CPUs can delay the - * panic and risk deadlock on console resources. - */ -static int __read_mostly suppress_panic_printk; - DECLARE_WAIT_QUEUE_HEAD(log_wait); /* All 3 protected by @syslog_lock. */ /* the next printk record to read by syslog(READ) or /proc/kmsg */ @@ -598,17 +615,6 @@ static int check_syslog_permissions(int type, int source) if (syslog_action_restricted(type)) { if (capable(CAP_SYSLOG)) goto ok; - /* - * For historical reasons, accept CAP_SYS_ADMIN too, with - * a warning. - */ - if (capable(CAP_SYS_ADMIN)) { - pr_warn_once("%s (%d): Attempt to access syslog with " - "CAP_SYS_ADMIN but no CAP_SYSLOG " - "(deprecated).\n", - current->comm, task_pid_nr(current)); - goto ok; - } return -EPERM; } ok: @@ -951,7 +957,7 @@ const struct file_operations kmsg_fops = { .release = devkmsg_release, }; -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_VMCORE_INFO /* * This appends the listed symbols to /proc/vmcore * @@ -1846,10 +1852,23 @@ static bool console_waiter; */ static void console_lock_spinning_enable(void) { + /* + * Do not use spinning in panic(). The panic CPU wants to keep the lock. + * Non-panic CPUs abandon the flush anyway. + * + * Just keep the lockdep annotation. The panic-CPU should avoid + * taking console_owner_lock because it might cause a deadlock. + * This looks like the easiest way how to prevent false lockdep + * reports without handling races a lockless way. + */ + if (panic_in_progress()) + goto lockdep; + raw_spin_lock(&console_owner_lock); console_owner = current; raw_spin_unlock(&console_owner_lock); +lockdep: /* The waiter may spin on us after setting console_owner */ spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); } @@ -1874,6 +1893,22 @@ static int console_lock_spinning_disable_and_check(int cookie) { int waiter; + /* + * Ignore spinning waiters during panic() because they might get stopped + * or blocked at any time, + * + * It is safe because nobody is allowed to start spinning during panic + * in the first place. If there has been a waiter then non panic CPUs + * might stay spinning. They would get stopped anyway. The panic context + * will never start spinning and an interrupted spin on panic CPU will + * never continue. + */ + if (panic_in_progress()) { + /* Keep lockdep happy. */ + spin_release(&console_owner_dep_map, _THIS_IP_); + return 0; + } + raw_spin_lock(&console_owner_lock); waiter = READ_ONCE(console_waiter); console_owner = NULL; @@ -2270,8 +2305,12 @@ asmlinkage int vprintk_emit(int facility, int level, if (unlikely(suppress_printk)) return 0; - if (unlikely(suppress_panic_printk) && - atomic_read(&panic_cpu) != raw_smp_processor_id()) + /* + * The messages on the panic CPU are the most important. If + * non-panic CPUs are generating any messages, they will be + * silently dropped. + */ + if (other_cpu_in_panic()) return 0; if (level == LOGLEVEL_SCHED) { @@ -2601,26 +2640,6 @@ static int console_cpu_notify(unsigned int cpu) return 0; } -/* - * Return true if a panic is in progress on a remote CPU. - * - * On true, the local CPU should immediately release any printing resources - * that may be needed by the panic CPU. - */ -bool other_cpu_in_panic(void) -{ - if (!panic_in_progress()) - return false; - - /* - * We can use raw_smp_processor_id() here because it is impossible for - * the task to be migrated to the panic_cpu, or away from it. If - * panic_cpu has already been set, and we're not currently executing on - * that CPU, then we never will be. - */ - return atomic_read(&panic_cpu) != raw_smp_processor_id(); -} - /** * console_lock - block the console subsystem from printing * @@ -2776,8 +2795,6 @@ void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) bool printk_get_next_message(struct printk_message *pmsg, u64 seq, bool is_extended, bool may_suppress) { - static int panic_console_dropped; - struct printk_buffers *pbufs = pmsg->pbufs; const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); const size_t outbuf_sz = sizeof(pbufs->outbuf); @@ -2805,17 +2822,6 @@ bool printk_get_next_message(struct printk_message *pmsg, u64 seq, pmsg->seq = r.info->seq; pmsg->dropped = r.info->seq - seq; - /* - * Check for dropped messages in panic here so that printk - * suppression can occur as early as possible if necessary. - */ - if (pmsg->dropped && - panic_in_progress() && - panic_console_dropped++ > 10) { - suppress_panic_printk = 1; - pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n"); - } - /* Skip record that has level above the console loglevel. */ if (may_suppress && suppress_message_printing(r.info->level)) goto out; @@ -3263,6 +3269,21 @@ static int __init keep_bootcon_setup(char *str) early_param("keep_bootcon", keep_bootcon_setup); +static int console_call_setup(struct console *newcon, char *options) +{ + int err; + + if (!newcon->setup) + return 0; + + /* Synchronize with possible boot console. */ + console_lock(); + err = newcon->setup(newcon, options); + console_unlock(); + + return err; +} + /* * This is called by register_console() to try to match * the newly registered console with any of the ones selected @@ -3298,8 +3319,8 @@ static int try_enable_preferred_console(struct console *newcon, if (_braille_register_console(newcon, c)) return 0; - if (newcon->setup && - (err = newcon->setup(newcon, c->options)) != 0) + err = console_call_setup(newcon, c->options); + if (err) return err; } newcon->flags |= CON_ENABLED; @@ -3325,7 +3346,7 @@ static void try_enable_default_console(struct console *newcon) if (newcon->index < 0) newcon->index = 0; - if (newcon->setup && newcon->setup(newcon, NULL) != 0) + if (console_call_setup(newcon, NULL) != 0) return; newcon->flags |= CON_ENABLED; @@ -3761,7 +3782,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre might_sleep(); - seq = prb_next_seq(prb); + seq = prb_next_reserve_seq(prb); /* Flush the consoles so that records up to @seq are printed. */ console_lock(); diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index fde338606ce8..88e8f3a61922 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -6,6 +6,7 @@ #include <linux/errno.h> #include <linux/bug.h> #include "printk_ringbuffer.h" +#include "internal.h" /** * DOC: printk_ringbuffer overview @@ -303,6 +304,9 @@ * * desc_push_tail:B / desc_reserve:D * set descriptor reusable (state), then push descriptor tail (id) + * + * desc_update_last_finalized:A / desc_last_finalized_seq:A + * store finalized record, then set new highest finalized sequence number */ #define DATA_SIZE(data_ring) _DATA_SIZE((data_ring)->size_bits) @@ -1030,9 +1034,13 @@ static char *data_alloc(struct printk_ringbuffer *rb, unsigned int size, unsigned long next_lpos; if (size == 0) { - /* Specify a data-less block. */ - blk_lpos->begin = NO_LPOS; - blk_lpos->next = NO_LPOS; + /* + * Data blocks are not created for empty lines. Instead, the + * reader will recognize these special lpos values and handle + * it appropriately. + */ + blk_lpos->begin = EMPTY_LINE_LPOS; + blk_lpos->next = EMPTY_LINE_LPOS; return NULL; } @@ -1210,10 +1218,18 @@ static const char *get_data(struct prb_data_ring *data_ring, /* Data-less data block description. */ if (BLK_DATALESS(blk_lpos)) { - if (blk_lpos->begin == NO_LPOS && blk_lpos->next == NO_LPOS) { + /* + * Records that are just empty lines are also valid, even + * though they do not have a data block. For such records + * explicitly return empty string data to signify success. + */ + if (blk_lpos->begin == EMPTY_LINE_LPOS && + blk_lpos->next == EMPTY_LINE_LPOS) { *data_size = 0; return ""; } + + /* Data lost, invalid, or otherwise unavailable. */ return NULL; } @@ -1442,19 +1458,117 @@ fail_reopen: } /* + * @last_finalized_seq value guarantees that all records up to and including + * this sequence number are finalized and can be read. The only exception are + * too old records which have already been overwritten. + * + * It is also guaranteed that @last_finalized_seq only increases. + * + * Be aware that finalized records following non-finalized records are not + * reported because they are not yet available to the reader. For example, + * a new record stored via printk() will not be available to a printer if + * it follows a record that has not been finalized yet. However, once that + * non-finalized record becomes finalized, @last_finalized_seq will be + * appropriately updated and the full set of finalized records will be + * available to the printer. And since each printk() caller will either + * directly print or trigger deferred printing of all available unprinted + * records, all printk() messages will get printed. + */ +static u64 desc_last_finalized_seq(struct printk_ringbuffer *rb) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + unsigned long ulseq; + + /* + * Guarantee the sequence number is loaded before loading the + * associated record in order to guarantee that the record can be + * seen by this CPU. This pairs with desc_update_last_finalized:A. + */ + ulseq = atomic_long_read_acquire(&desc_ring->last_finalized_seq + ); /* LMM(desc_last_finalized_seq:A) */ + + return __ulseq_to_u64seq(rb, ulseq); +} + +static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, + struct printk_record *r, unsigned int *line_count); + +/* + * Check if there are records directly following @last_finalized_seq that are + * finalized. If so, update @last_finalized_seq to the latest of these + * records. It is not allowed to skip over records that are not yet finalized. + */ +static void desc_update_last_finalized(struct printk_ringbuffer *rb) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + u64 old_seq = desc_last_finalized_seq(rb); + unsigned long oldval; + unsigned long newval; + u64 finalized_seq; + u64 try_seq; + +try_again: + finalized_seq = old_seq; + try_seq = finalized_seq + 1; + + /* Try to find later finalized records. */ + while (_prb_read_valid(rb, &try_seq, NULL, NULL)) { + finalized_seq = try_seq; + try_seq++; + } + + /* No update needed if no later finalized record was found. */ + if (finalized_seq == old_seq) + return; + + oldval = __u64seq_to_ulseq(old_seq); + newval = __u64seq_to_ulseq(finalized_seq); + + /* + * Set the sequence number of a later finalized record that has been + * seen. + * + * Guarantee the record data is visible to other CPUs before storing + * its sequence number. This pairs with desc_last_finalized_seq:A. + * + * Memory barrier involvement: + * + * If desc_last_finalized_seq:A reads from + * desc_update_last_finalized:A, then desc_read:A reads from + * _prb_commit:B. + * + * Relies on: + * + * RELEASE from _prb_commit:B to desc_update_last_finalized:A + * matching + * ACQUIRE from desc_last_finalized_seq:A to desc_read:A + * + * Note: _prb_commit:B and desc_update_last_finalized:A can be + * different CPUs. However, the desc_update_last_finalized:A + * CPU (which performs the release) must have previously seen + * _prb_commit:B. + */ + if (!atomic_long_try_cmpxchg_release(&desc_ring->last_finalized_seq, + &oldval, newval)) { /* LMM(desc_update_last_finalized:A) */ + old_seq = __ulseq_to_u64seq(rb, oldval); + goto try_again; + } +} + +/* * Attempt to finalize a specified descriptor. If this fails, the descriptor * is either already final or it will finalize itself when the writer commits. */ -static void desc_make_final(struct prb_desc_ring *desc_ring, unsigned long id) +static void desc_make_final(struct printk_ringbuffer *rb, unsigned long id) { + struct prb_desc_ring *desc_ring = &rb->desc_ring; unsigned long prev_state_val = DESC_SV(id, desc_committed); struct prb_desc *d = to_desc(desc_ring, id); - atomic_long_cmpxchg_relaxed(&d->state_var, prev_state_val, - DESC_SV(id, desc_finalized)); /* LMM(desc_make_final:A) */ - - /* Best effort to remember the last finalized @id. */ - atomic_long_set(&desc_ring->last_finalized_id, id); + if (atomic_long_try_cmpxchg_relaxed(&d->state_var, &prev_state_val, + DESC_SV(id, desc_finalized))) { /* LMM(desc_make_final:A) */ + desc_update_last_finalized(rb); + } } /** @@ -1550,7 +1664,7 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, * readers. (For seq==0 there is no previous descriptor.) */ if (info->seq > 0) - desc_make_final(desc_ring, DESC_ID(id - 1)); + desc_make_final(rb, DESC_ID(id - 1)); r->text_buf = data_alloc(rb, r->text_buf_size, &d->text_blk_lpos, id); /* If text data allocation fails, a data-less record is committed. */ @@ -1643,7 +1757,7 @@ void prb_commit(struct prb_reserved_entry *e) */ head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_commit:A) */ if (head_id != e->id) - desc_make_final(desc_ring, e->id); + desc_make_final(e->rb, e->id); } /** @@ -1663,12 +1777,9 @@ void prb_commit(struct prb_reserved_entry *e) */ void prb_final_commit(struct prb_reserved_entry *e) { - struct prb_desc_ring *desc_ring = &e->rb->desc_ring; - _prb_commit(e, desc_finalized); - /* Best effort to remember the last finalized @id. */ - atomic_long_set(&desc_ring->last_finalized_id, e->id); + desc_update_last_finalized(e->rb); } /* @@ -1832,7 +1943,7 @@ static int prb_read(struct printk_ringbuffer *rb, u64 seq, } /* Get the sequence number of the tail descriptor. */ -static u64 prb_first_seq(struct printk_ringbuffer *rb) +u64 prb_first_seq(struct printk_ringbuffer *rb) { struct prb_desc_ring *desc_ring = &rb->desc_ring; enum desc_state d_state; @@ -1875,12 +1986,123 @@ static u64 prb_first_seq(struct printk_ringbuffer *rb) return seq; } +/** + * prb_next_reserve_seq() - Get the sequence number after the most recently + * reserved record. + * + * @rb: The ringbuffer to get the sequence number from. + * + * This is the public function available to readers to see what sequence + * number will be assigned to the next reserved record. + * + * Note that depending on the situation, this value can be equal to or + * higher than the sequence number returned by prb_next_seq(). + * + * Context: Any context. + * Return: The sequence number that will be assigned to the next record + * reserved. + */ +u64 prb_next_reserve_seq(struct printk_ringbuffer *rb) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + unsigned long last_finalized_id; + atomic_long_t *state_var; + u64 last_finalized_seq; + unsigned long head_id; + struct prb_desc desc; + unsigned long diff; + struct prb_desc *d; + int err; + + /* + * It may not be possible to read a sequence number for @head_id. + * So the ID of @last_finailzed_seq is used to calculate what the + * sequence number of @head_id will be. + */ + +try_again: + last_finalized_seq = desc_last_finalized_seq(rb); + + /* + * @head_id is loaded after @last_finalized_seq to ensure that + * it points to the record with @last_finalized_seq or newer. + * + * Memory barrier involvement: + * + * If desc_last_finalized_seq:A reads from + * desc_update_last_finalized:A, then + * prb_next_reserve_seq:A reads from desc_reserve:D. + * + * Relies on: + * + * RELEASE from desc_reserve:D to desc_update_last_finalized:A + * matching + * ACQUIRE from desc_last_finalized_seq:A to prb_next_reserve_seq:A + * + * Note: desc_reserve:D and desc_update_last_finalized:A can be + * different CPUs. However, the desc_update_last_finalized:A CPU + * (which performs the release) must have previously seen + * desc_read:C, which implies desc_reserve:D can be seen. + */ + head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_next_reserve_seq:A) */ + + d = to_desc(desc_ring, last_finalized_seq); + state_var = &d->state_var; + + /* Extract the ID, used to specify the descriptor to read. */ + last_finalized_id = DESC_ID(atomic_long_read(state_var)); + + /* Ensure @last_finalized_id is correct. */ + err = desc_read_finalized_seq(desc_ring, last_finalized_id, last_finalized_seq, &desc); + + if (err == -EINVAL) { + if (last_finalized_seq == 0) { + /* + * No record has been finalized or even reserved yet. + * + * The @head_id is initialized such that the first + * increment will yield the first record (seq=0). + * Handle it separately to avoid a negative @diff + * below. + */ + if (head_id == DESC0_ID(desc_ring->count_bits)) + return 0; + + /* + * One or more descriptors are already reserved. Use + * the descriptor ID of the first one (@seq=0) for + * the @diff below. + */ + last_finalized_id = DESC0_ID(desc_ring->count_bits) + 1; + } else { + /* Record must have been overwritten. Try again. */ + goto try_again; + } + } + + /* Diff of known descriptor IDs to compute related sequence numbers. */ + diff = head_id - last_finalized_id; + + /* + * @head_id points to the most recently reserved record, but this + * function returns the sequence number that will be assigned to the + * next (not yet reserved) record. Thus +1 is needed. + */ + return (last_finalized_seq + diff + 1); +} + /* - * Non-blocking read of a record. Updates @seq to the last finalized record - * (which may have no data available). + * Non-blocking read of a record. + * + * On success @seq is updated to the record that was read and (if provided) + * @r and @line_count will contain the read/calculated data. + * + * On failure @seq is updated to a record that is not yet available to the + * reader, but it will be the next record available to the reader. * - * See the description of prb_read_valid() and prb_read_valid_info() - * for details. + * Note: When the current CPU is in panic, this function will skip over any + * non-existent/non-finalized records in order to allow the panic CPU + * to print any and all records that have been finalized. */ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, struct printk_record *r, unsigned int *line_count) @@ -1899,12 +2121,32 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, *seq = tail_seq; } else if (err == -ENOENT) { - /* Record exists, but no data available. Skip. */ + /* Record exists, but the data was lost. Skip. */ (*seq)++; } else { - /* Non-existent/non-finalized record. Must stop. */ - return false; + /* + * Non-existent/non-finalized record. Must stop. + * + * For panic situations it cannot be expected that + * non-finalized records will become finalized. But + * there may be other finalized records beyond that + * need to be printed for a panic situation. If this + * is the panic CPU, skip this + * non-existent/non-finalized record unless it is + * at or beyond the head, in which case it is not + * possible to continue. + * + * Note that new messages printed on panic CPU are + * finalized when we are here. The only exception + * might be the last message without trailing newline. + * But it would have the sequence number returned + * by "prb_next_reserve_seq() - 1". + */ + if (this_cpu_in_panic() && ((*seq + 1) < prb_next_reserve_seq(rb))) + (*seq)++; + else + return false; } } @@ -1932,7 +2174,7 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, * On success, the reader must check r->info.seq to see which record was * actually read. This allows the reader to detect dropped records. * - * Failure means @seq refers to a not yet written record. + * Failure means @seq refers to a record not yet available to the reader. */ bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, struct printk_record *r) @@ -1962,7 +2204,7 @@ bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, * On success, the reader must check info->seq to see which record meta data * was actually read. This allows the reader to detect dropped records. * - * Failure means @seq refers to a not yet written record. + * Failure means @seq refers to a record not yet available to the reader. */ bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq, struct printk_info *info, unsigned int *line_count) @@ -2008,7 +2250,9 @@ u64 prb_first_valid_seq(struct printk_ringbuffer *rb) * newest sequence number available to readers will be. * * This provides readers a sequence number to jump to if all currently - * available records should be skipped. + * available records should be skipped. It is guaranteed that all records + * previous to the returned value have been finalized and are (or were) + * available to the reader. * * Context: Any context. * Return: The sequence number of the next newest (not yet available) record @@ -2016,34 +2260,19 @@ u64 prb_first_valid_seq(struct printk_ringbuffer *rb) */ u64 prb_next_seq(struct printk_ringbuffer *rb) { - struct prb_desc_ring *desc_ring = &rb->desc_ring; - enum desc_state d_state; - unsigned long id; u64 seq; - /* Check if the cached @id still points to a valid @seq. */ - id = atomic_long_read(&desc_ring->last_finalized_id); - d_state = desc_read(desc_ring, id, NULL, &seq, NULL); + seq = desc_last_finalized_seq(rb); - if (d_state == desc_finalized || d_state == desc_reusable) { - /* - * Begin searching after the last finalized record. - * - * On 0, the search must begin at 0 because of hack#2 - * of the bootstrapping phase it is not known if a - * record at index 0 exists. - */ - if (seq != 0) - seq++; - } else { - /* - * The information about the last finalized sequence number - * has gone. It should happen only when there is a flood of - * new messages and the ringbuffer is rapidly recycled. - * Give up and start from the beginning. - */ - seq = 0; - } + /* + * Begin searching after the last finalized record. + * + * On 0, the search must begin at 0 because of hack#2 + * of the bootstrapping phase it is not known if a + * record at index 0 exists. + */ + if (seq != 0) + seq++; /* * The information about the last finalized @seq might be inaccurate. @@ -2085,7 +2314,7 @@ void prb_init(struct printk_ringbuffer *rb, rb->desc_ring.infos = infos; atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits)); atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits)); - atomic_long_set(&rb->desc_ring.last_finalized_id, DESC0_ID(descbits)); + atomic_long_set(&rb->desc_ring.last_finalized_seq, 0); rb->text_data_ring.size_bits = textbits; rb->text_data_ring.data = text_buf; diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h index 18cd25e489b8..52626d0f1fa3 100644 --- a/kernel/printk/printk_ringbuffer.h +++ b/kernel/printk/printk_ringbuffer.h @@ -75,7 +75,7 @@ struct prb_desc_ring { struct printk_info *infos; atomic_long_t head_id; atomic_long_t tail_id; - atomic_long_t last_finalized_id; + atomic_long_t last_finalized_seq; }; /* @@ -127,8 +127,22 @@ enum desc_state { #define DESC_SV(id, state) (((unsigned long)state << DESC_FLAGS_SHIFT) | id) #define DESC_ID_MASK (~DESC_FLAGS_MASK) #define DESC_ID(sv) ((sv) & DESC_ID_MASK) + +/* + * Special data block logical position values (for fields of + * @prb_desc.text_blk_lpos). + * + * - Bit0 is used to identify if the record has no data block. (Implemented in + * the LPOS_DATALESS() macro.) + * + * - Bit1 specifies the reason for not having a data block. + * + * These special values could never be real lpos values because of the + * meta data and alignment padding of data blocks. (See to_blk_size() for + * details.) + */ #define FAILED_LPOS 0x1 -#define NO_LPOS 0x3 +#define EMPTY_LINE_LPOS 0x3 #define FAILED_BLK_LPOS \ { \ @@ -259,7 +273,7 @@ static struct printk_ringbuffer name = { \ .infos = &_##name##_infos[0], \ .head_id = ATOMIC_INIT(DESC0_ID(descbits)), \ .tail_id = ATOMIC_INIT(DESC0_ID(descbits)), \ - .last_finalized_id = ATOMIC_INIT(DESC0_ID(descbits)), \ + .last_finalized_seq = ATOMIC_INIT(0), \ }, \ .text_data_ring = { \ .size_bits = (avgtextbits) + (descbits), \ @@ -378,7 +392,41 @@ bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq, struct printk_info *info, unsigned int *line_count); +u64 prb_first_seq(struct printk_ringbuffer *rb); u64 prb_first_valid_seq(struct printk_ringbuffer *rb); u64 prb_next_seq(struct printk_ringbuffer *rb); +u64 prb_next_reserve_seq(struct printk_ringbuffer *rb); + +#ifdef CONFIG_64BIT + +#define __u64seq_to_ulseq(u64seq) (u64seq) +#define __ulseq_to_u64seq(rb, ulseq) (ulseq) + +#else /* CONFIG_64BIT */ + +#define __u64seq_to_ulseq(u64seq) ((u32)u64seq) + +static inline u64 __ulseq_to_u64seq(struct printk_ringbuffer *rb, u32 ulseq) +{ + u64 rb_first_seq = prb_first_seq(rb); + u64 seq; + + /* + * The provided sequence is only the lower 32 bits of the ringbuffer + * sequence. It needs to be expanded to 64bit. Get the first sequence + * number from the ringbuffer and fold it. + * + * Having a 32bit representation in the console is sufficient. + * If a console ever gets more than 2^31 records behind + * the ringbuffer then this is the least of the problems. + * + * Also the access to the ring buffer is always safe. + */ + seq = rb_first_seq - (s32)((u32)rb_first_seq - ulseq); + + return seq; +} + +#endif /* CONFIG_64BIT */ #endif /* _KERNEL_PRINTK_RINGBUFFER_H */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 2fabd497d659..d5f89f9ef29f 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -375,10 +375,13 @@ static int check_ptrace_options(unsigned long data) return 0; } -static inline void ptrace_set_stopped(struct task_struct *task) +static inline void ptrace_set_stopped(struct task_struct *task, bool seize) { guard(spinlock)(&task->sighand->siglock); + /* SEIZE doesn't trap tracee on attach */ + if (!seize) + send_signal_locked(SIGSTOP, SEND_SIG_PRIV, task, PIDTYPE_PID); /* * If the task is already STOPPED, set JOBCTL_TRAP_STOP and * TRAPPING, and kick it so that it transits to TRACED. TRAPPING @@ -457,14 +460,8 @@ static int ptrace_attach(struct task_struct *task, long request, return -EPERM; task->ptrace = flags; - ptrace_link(task, current); - - /* SEIZE doesn't trap tracee on attach */ - if (!seize) - send_sig_info(SIGSTOP, SEND_SIG_PRIV, task); - - ptrace_set_stopped(task); + ptrace_set_stopped(task, seize); } } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 929fce69f555..0621e4ee31de 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6647,7 +6647,9 @@ static void __sched notrace __schedule(unsigned int sched_mode) * if (signal_pending_state()) if (p->state & @state) * * Also, the membarrier system call requires a full memory barrier - * after coming from user-space, before storing to rq->curr. + * after coming from user-space, before storing to rq->curr; this + * barrier matches a full barrier in the proximity of the membarrier + * system call exit. */ rq_lock(rq, &rf); smp_mb__after_spinlock(); @@ -6718,12 +6720,20 @@ static void __sched notrace __schedule(unsigned int sched_mode) * * Here are the schemes providing that barrier on the * various architectures: - * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. - * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. + * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC, + * RISC-V. switch_mm() relies on membarrier_arch_switch_mm() + * on PowerPC and on RISC-V. * - finish_lock_switch() for weakly-ordered * architectures where spin_unlock is a full barrier, * - switch_to() for arm64 (weakly-ordered, spin_unlock * is a RELEASE barrier), + * + * The barrier matches a full barrier in the proximity of + * the membarrier system call entry. + * + * On RISC-V, this barrier pairing is also needed for the + * SYNC_CORE command when switching between processes, cf. + * the inline comments in membarrier_arch_switch_mm(). */ ++*switch_count; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c8e50fbac345..e8270e2e15cb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1831,6 +1831,12 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, int last_cpupid, this_cpupid; /* + * Cannot migrate to memoryless nodes. + */ + if (!node_state(dst_nid, N_MEMORY)) + return false; + + /* * The pages in slow memory node should be migrated according * to hot/cold instead of private/shared. */ diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 4e715b9b278e..809194cd779f 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -254,7 +254,7 @@ static int membarrier_global_expedited(void) return 0; /* - * Matches memory barriers around rq->curr modification in + * Matches memory barriers after rq->curr modification in * scheduler. */ smp_mb(); /* system call entry is not a mb. */ @@ -304,7 +304,7 @@ static int membarrier_global_expedited(void) /* * Memory barrier on the caller thread _after_ we finished - * waiting for the last IPI. Matches memory barriers around + * waiting for the last IPI. Matches memory barriers before * rq->curr modification in scheduler. */ smp_mb(); /* exit from system call is not a mb */ @@ -324,6 +324,7 @@ static int membarrier_private_expedited(int flags, int cpu_id) MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) return -EPERM; ipi_func = ipi_sync_core; + prepare_sync_core_cmd(mm); } else if (flags == MEMBARRIER_FLAG_RSEQ) { if (!IS_ENABLED(CONFIG_RSEQ)) return -EINVAL; @@ -343,8 +344,12 @@ static int membarrier_private_expedited(int flags, int cpu_id) return 0; /* - * Matches memory barriers around rq->curr modification in + * Matches memory barriers after rq->curr modification in * scheduler. + * + * On RISC-V, this barrier pairing is also needed for the + * SYNC_CORE command when switching between processes, cf. + * the inline comments in membarrier_arch_switch_mm(). */ smp_mb(); /* system call entry is not a mb. */ @@ -420,7 +425,7 @@ out: /* * Memory barrier on the caller thread _after_ we finished - * waiting for the last IPI. Matches memory barriers around + * waiting for the last IPI. Matches memory barriers before * rq->curr modification in scheduler. */ smp_mb(); /* exit from system call is not a mb */ diff --git a/kernel/signal.c b/kernel/signal.c index bdca529f0f7b..7bdbcf1b78d0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2741,12 +2741,15 @@ relock: /* Has this task already been marked for death? */ if ((signal->flags & SIGNAL_GROUP_EXIT) || signal->group_exec_task) { - clear_siginfo(&ksig->info); - ksig->info.si_signo = signr = SIGKILL; + signr = SIGKILL; sigdelset(¤t->pending.signal, SIGKILL); trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO, - &sighand->action[SIGKILL - 1]); + &sighand->action[SIGKILL-1]); recalc_sigpending(); + /* + * implies do_group_exit() or return to PF_USER_WORKER, + * no need to initialize ksig->info/etc. + */ goto fatal; } @@ -2856,7 +2859,7 @@ relock: spin_lock_irq(&sighand->siglock); } - if (likely(do_signal_stop(ksig->info.si_signo))) { + if (likely(do_signal_stop(signr))) { /* It released the siglock. */ goto relock; } @@ -2880,7 +2883,7 @@ relock: if (sig_kernel_coredump(signr)) { if (print_fatal_signals) - print_fatal_signal(ksig->info.si_signo); + print_fatal_signal(signr); proc_coredump_connector(current); /* * If it was able to dump core, this kills all @@ -2895,8 +2898,9 @@ relock: /* * PF_USER_WORKER threads will catch and exit on fatal signals - * themselves. They have cleanup that must be performed, so - * we cannot call do_exit() on their behalf. + * themselves. They have cleanup that must be performed, so we + * cannot call do_exit() on their behalf. Note that ksig won't + * be properly initialized, PF_USER_WORKER's shouldn't use it. */ if (current->flags & PF_USER_WORKER) goto out; @@ -2904,17 +2908,17 @@ relock: /* * Death signals, no core dump. */ - do_group_exit(ksig->info.si_signo); + do_group_exit(signr); /* NOTREACHED */ } spin_unlock_irq(&sighand->siglock); -out: + ksig->sig = signr; - if (!(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS)) + if (signr && !(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS)) hide_si_addr_tag_bits(ksig); - - return ksig->sig > 0; +out: + return signr > 0; } /** diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 157f7ce2942d..81cc974913bb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1710,9 +1710,9 @@ static struct ctl_table kern_table[] = { { .procname = "ftrace_dump_on_oops", .data = &ftrace_dump_on_oops, - .maxlen = sizeof(int), + .maxlen = MAX_TRACER_SIZE, .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dostring, }, { .procname = "traceoff_on_warning", diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 4657cb8e8b1f..5abfa4390673 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -134,7 +134,7 @@ static struct class_interface alarmtimer_rtc_interface = { static int alarmtimer_rtc_interface_setup(void) { - alarmtimer_rtc_interface.class = rtc_class; + alarmtimer_rtc_interface.class = &rtc_class; return class_interface_register(&alarmtimer_rtc_interface); } static void alarmtimer_rtc_interface_remove(void) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index ff49ddcc9800..3b57a73f249a 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -642,7 +642,8 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) * the base lock: */ if (base->is_idle) { - WARN_ON_ONCE(!(timer->flags & TIMER_PINNED)); + WARN_ON_ONCE(!(timer->flags & TIMER_PINNED || + tick_nohz_full_cpu(base->cpu))); wake_up_nohz_cpu(base->cpu); } } @@ -2292,6 +2293,13 @@ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem, */ if (!base_local->is_idle && time_after(nextevt, basej + 1)) { base_local->is_idle = true; + /* + * Global timers queued locally while running in a task + * in nohz_full mode need a self-IPI to kick reprogramming + * in IRQ tail. + */ + if (tick_nohz_full_cpu(base_local->cpu)) + base_global->is_idle = true; trace_timer_base_idle(true, base_local->cpu); } *idle = base_local->is_idle; @@ -2364,6 +2372,8 @@ void timer_clear_idle(void) * path. Required for BASE_LOCAL only. */ __this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false); + if (tick_nohz_full_cpu(smp_processor_id())) + __this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false); trace_timer_base_idle(false, smp_processor_id()); /* Activate without holding the timer_base->lock */ diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 8f49b6b96dfd..c63a0afdcebe 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -751,26 +751,6 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, first_childevt = evt = data->evt; - /* - * Walking the hierarchy is required in any case when a - * remote expiry was done before. This ensures to not lose - * already queued events in non active groups (see section - * "Required event and timerqueue update after a remote - * expiry" in the documentation at the top). - * - * The two call sites which are executed without a remote expiry - * before, are not prevented from propagating changes through - * the hierarchy by the return: - * - When entering this path by tmigr_new_timer(), @evt->ignore - * is never set. - * - tmigr_inactive_up() takes care of the propagation by - * itself and ignores the return value. But an immediate - * return is required because nothing has to be done in this - * level as the event could be ignored. - */ - if (evt->ignore && !remote) - return true; - raw_spin_lock(&group->lock); childstate.state = 0; @@ -1058,8 +1038,15 @@ void tmigr_handle_remote(void) * in tmigr_handle_remote_up() anyway. Keep this check to speed up the * return when nothing has to be done. */ - if (!tmigr_check_migrator(tmc->tmgroup, tmc->childmask)) - return; + if (!tmigr_check_migrator(tmc->tmgroup, tmc->childmask)) { + /* + * If this CPU was an idle migrator, make sure to clear its wakeup + * value so it won't chase timers that have already expired elsewhere. + * This avoids endless requeue from tmigr_new_timer(). + */ + if (READ_ONCE(tmc->wakeup) == KTIME_MAX) + return; + } data.now = get_jiffies_update(&data.basej); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7ac6c52b25eb..0a5c4efc73c3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1412,14 +1412,14 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr, __bpf_kfunc_end_defs(); -BTF_SET8_START(key_sig_kfunc_set) +BTF_KFUNCS_START(key_sig_kfunc_set) BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE) #ifdef CONFIG_SYSTEM_DATA_VERIFICATION BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE) #endif -BTF_SET8_END(key_sig_kfunc_set) +BTF_KFUNCS_END(key_sig_kfunc_set) static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = { .owner = THIS_MODULE, @@ -1475,9 +1475,9 @@ __bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str, __bpf_kfunc_end_defs(); -BTF_SET8_START(fs_kfunc_set_ids) +BTF_KFUNCS_START(fs_kfunc_set_ids) BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) -BTF_SET8_END(fs_kfunc_set_ids) +BTF_KFUNCS_END(fs_kfunc_set_ids) static int bpf_get_file_xattr_filter(const struct bpf_prog *prog, u32 kfunc_id) { @@ -1629,7 +1629,7 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_trace_vprintk: return bpf_get_trace_vprintk_proto(); default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } } @@ -2679,6 +2679,7 @@ static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link) static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { + u64 __user *ucookies = u64_to_user_ptr(info->kprobe_multi.cookies); u64 __user *uaddrs = u64_to_user_ptr(info->kprobe_multi.addrs); struct bpf_kprobe_multi_link *kmulti_link; u32 ucount = info->kprobe_multi.count; @@ -2686,6 +2687,8 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, if (!uaddrs ^ !ucount) return -EINVAL; + if (ucookies && !ucount) + return -EINVAL; kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); info->kprobe_multi.count = kmulti_link->cnt; @@ -2699,6 +2702,18 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, else ucount = kmulti_link->cnt; + if (ucookies) { + if (kmulti_link->cookies) { + if (copy_to_user(ucookies, kmulti_link->cookies, ucount * sizeof(u64))) + return -EFAULT; + } else { + for (i = 0; i < ucount; i++) { + if (put_user(0, ucookies + i)) + return -EFAULT; + } + } + } + if (kallsyms_show_value(current_cred())) { if (copy_to_user(uaddrs, kmulti_link->addrs, ucount * sizeof(u64))) return -EFAULT; @@ -3241,7 +3256,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe, .uprobe = uprobe, }; struct bpf_prog *prog = link->link.prog; - bool sleepable = prog->aux->sleepable; + bool sleepable = prog->sleepable; struct bpf_run_ctx *old_run_ctx; int err = 0; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 83ba342aef31..da1710499698 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1160,7 +1160,7 @@ __ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) * Search a given @hash to see if a given instruction pointer (@ip) * exists in it. * - * Returns the entry that holds the @ip if found. NULL otherwise. + * Returns: the entry that holds the @ip if found. NULL otherwise. */ struct ftrace_func_entry * ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) @@ -1282,7 +1282,7 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) /** * ftrace_free_filter - remove all filters for an ftrace_ops - * @ops - the ops to remove the filters from + * @ops: the ops to remove the filters from */ void ftrace_free_filter(struct ftrace_ops *ops) { @@ -1587,7 +1587,7 @@ static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end) * @end: end of range to search (inclusive). @end points to the last byte * to check. * - * Returns rec->ip if the related ftrace location is a least partly within + * Returns: rec->ip if the related ftrace location is a least partly within * the given address range. That is, the first address of the instruction * that is either a NOP or call to the function tracer. It checks the ftrace * internal tables to determine if the address belongs or not. @@ -1607,9 +1607,10 @@ unsigned long ftrace_location_range(unsigned long start, unsigned long end) * ftrace_location - return the ftrace location * @ip: the instruction pointer to check * - * If @ip matches the ftrace location, return @ip. - * If @ip matches sym+0, return sym's ftrace location. - * Otherwise, return 0. + * Returns: + * * If @ip matches the ftrace location, return @ip. + * * If @ip matches sym+0, return sym's ftrace location. + * * Otherwise, return 0. */ unsigned long ftrace_location(unsigned long ip) { @@ -1639,7 +1640,7 @@ out: * @start: start of range to search * @end: end of range to search (inclusive). @end points to the last byte to check. * - * Returns 1 if @start and @end contains a ftrace location. + * Returns: 1 if @start and @end contains a ftrace location. * That is, the instruction that is either a NOP or call to * the function tracer. It checks the ftrace internal tables to * determine if the address belongs or not. @@ -2574,7 +2575,7 @@ static void call_direct_funcs(unsigned long ip, unsigned long pip, * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS * is not set, then it wants to convert to the normal callback. * - * Returns the address of the trampoline to set to + * Returns: the address of the trampoline to set to */ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) { @@ -2615,7 +2616,7 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) * a function that saves all the regs. Basically the '_EN' version * represents the current state of the function. * - * Returns the address of the trampoline that is currently being called + * Returns: the address of the trampoline that is currently being called */ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) { @@ -2719,7 +2720,7 @@ struct ftrace_rec_iter { /** * ftrace_rec_iter_start - start up iterating over traced functions * - * Returns an iterator handle that is used to iterate over all + * Returns: an iterator handle that is used to iterate over all * the records that represent address locations where functions * are traced. * @@ -2751,7 +2752,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_start(void) * ftrace_rec_iter_next - get the next record to process. * @iter: The handle to the iterator. * - * Returns the next iterator after the given iterator @iter. + * Returns: the next iterator after the given iterator @iter. */ struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter) { @@ -2776,7 +2777,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter) * ftrace_rec_iter_record - get the record at the iterator location * @iter: The current iterator location * - * Returns the record that the current @iter is at. + * Returns: the record that the current @iter is at. */ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter) { @@ -4010,6 +4011,8 @@ ftrace_avail_addrs_open(struct inode *inode, struct file *file) * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. * tracing_lseek() should be used as the lseek routine, and * release must call ftrace_regex_release(). + * + * Returns: 0 on success or a negative errno value on failure */ int ftrace_regex_open(struct ftrace_ops *ops, int flag, @@ -4626,7 +4629,7 @@ struct ftrace_func_mapper { /** * allocate_ftrace_func_mapper - allocate a new ftrace_func_mapper * - * Returns a ftrace_func_mapper descriptor that can be used to map ips to data. + * Returns: a ftrace_func_mapper descriptor that can be used to map ips to data. */ struct ftrace_func_mapper *allocate_ftrace_func_mapper(void) { @@ -4646,7 +4649,7 @@ struct ftrace_func_mapper *allocate_ftrace_func_mapper(void) * @mapper: The mapper that has the ip maps * @ip: the instruction pointer to find the data for * - * Returns the data mapped to @ip if found otherwise NULL. The return + * Returns: the data mapped to @ip if found otherwise NULL. The return * is actually the address of the mapper data pointer. The address is * returned for use cases where the data is no bigger than a long, and * the user can use the data pointer as its data instead of having to @@ -4672,7 +4675,7 @@ void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper, * @ip: The instruction pointer address to map @data to * @data: The data to map to @ip * - * Returns 0 on success otherwise an error. + * Returns: 0 on success otherwise an error. */ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, unsigned long ip, void *data) @@ -4701,7 +4704,7 @@ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, * @mapper: The mapper that has the ip maps * @ip: The instruction pointer address to remove the data from * - * Returns the data if it is found, otherwise NULL. + * Returns: the data if it is found, otherwise NULL. * Note, if the data pointer is used as the data itself, (see * ftrace_func_mapper_find_ip(), then the return value may be meaningless, * if the data pointer was set to zero. @@ -5625,10 +5628,10 @@ EXPORT_SYMBOL_GPL(modify_ftrace_direct); /** * ftrace_set_filter_ip - set a function to filter on in ftrace by address - * @ops - the ops to set the filter with - * @ip - the address to add to or remove from the filter. - * @remove - non zero to remove the ip from the filter - * @reset - non zero to reset all filters before applying this filter. + * @ops: the ops to set the filter with + * @ip: the address to add to or remove from the filter. + * @remove: non zero to remove the ip from the filter + * @reset: non zero to reset all filters before applying this filter. * * Filters denote which functions should be enabled when tracing is enabled * If @ip is NULL, it fails to update filter. @@ -5647,11 +5650,11 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); /** * ftrace_set_filter_ips - set functions to filter on in ftrace by addresses - * @ops - the ops to set the filter with - * @ips - the array of addresses to add to or remove from the filter. - * @cnt - the number of addresses in @ips - * @remove - non zero to remove ips from the filter - * @reset - non zero to reset all filters before applying this filter. + * @ops: the ops to set the filter with + * @ips: the array of addresses to add to or remove from the filter. + * @cnt: the number of addresses in @ips + * @remove: non zero to remove ips from the filter + * @reset: non zero to reset all filters before applying this filter. * * Filters denote which functions should be enabled when tracing is enabled * If @ips array or any ip specified within is NULL , it fails to update filter. @@ -5670,7 +5673,7 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter_ips); /** * ftrace_ops_set_global_filter - setup ops to use global filters - * @ops - the ops which will use the global filters + * @ops: the ops which will use the global filters * * ftrace users who need global function trace filtering should call this. * It can set the global filter only if ops were not initialized before. @@ -5694,10 +5697,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, /** * ftrace_set_filter - set a function to filter on in ftrace - * @ops - the ops to set the filter with - * @buf - the string that holds the function filter text. - * @len - the length of the string. - * @reset - non zero to reset all filters before applying this filter. + * @ops: the ops to set the filter with + * @buf: the string that holds the function filter text. + * @len: the length of the string. + * @reset: non-zero to reset all filters before applying this filter. * * Filters denote which functions should be enabled when tracing is enabled. * If @buf is NULL and reset is set, all functions will be enabled for tracing. @@ -5716,10 +5719,10 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter); /** * ftrace_set_notrace - set a function to not trace in ftrace - * @ops - the ops to set the notrace filter with - * @buf - the string that holds the function notrace text. - * @len - the length of the string. - * @reset - non zero to reset all filters before applying this filter. + * @ops: the ops to set the notrace filter with + * @buf: the string that holds the function notrace text. + * @len: the length of the string. + * @reset: non-zero to reset all filters before applying this filter. * * Notrace Filters denote which functions should not be enabled when tracing * is enabled. If @buf is NULL and reset is set, all functions will be enabled @@ -5738,9 +5741,9 @@ int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, EXPORT_SYMBOL_GPL(ftrace_set_notrace); /** * ftrace_set_global_filter - set a function to filter on with global tracers - * @buf - the string that holds the function filter text. - * @len - the length of the string. - * @reset - non zero to reset all filters before applying this filter. + * @buf: the string that holds the function filter text. + * @len: the length of the string. + * @reset: non-zero to reset all filters before applying this filter. * * Filters denote which functions should be enabled when tracing is enabled. * If @buf is NULL and reset is set, all functions will be enabled for tracing. @@ -5753,9 +5756,9 @@ EXPORT_SYMBOL_GPL(ftrace_set_global_filter); /** * ftrace_set_global_notrace - set a function to not trace with global tracers - * @buf - the string that holds the function notrace text. - * @len - the length of the string. - * @reset - non zero to reset all filters before applying this filter. + * @buf: the string that holds the function notrace text. + * @len: the length of the string. + * @reset: non-zero to reset all filters before applying this filter. * * Notrace Filters denote which functions should not be enabled when tracing * is enabled. If @buf is NULL and reset is set, all functions will be enabled @@ -7443,7 +7446,7 @@ NOKPROBE_SYMBOL(ftrace_ops_assist_func); * have its own recursion protection, then it should call the * ftrace_ops_assist_func() instead. * - * Returns the function that the trampoline should call for @ops. + * Returns: the function that the trampoline should call for @ops. */ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) { @@ -7897,7 +7900,7 @@ void ftrace_kill(void) /** * ftrace_is_dead - Test if ftrace is dead or not. * - * Returns 1 if ftrace is "dead", zero otherwise. + * Returns: 1 if ftrace is "dead", zero otherwise. */ int ftrace_is_dead(void) { @@ -8142,8 +8145,7 @@ static int kallsyms_callback(void *data, const char *name, unsigned long addr) * @addrs array, which needs to be big enough to store at least @cnt * addresses. * - * This function returns 0 if all provided symbols are found, - * -ESRCH otherwise. + * Returns: 0 if all provided symbols are found, -ESRCH otherwise. */ int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs) { diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3103a484182e..25476ead681b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -384,6 +384,7 @@ struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; wait_queue_head_t full_waiters; + atomic_t seq; bool waiters_pending; bool full_waiters_pending; bool wakeup_full; @@ -753,6 +754,9 @@ static void rb_wake_up_waiters(struct irq_work *work) { struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); + /* For waiters waiting for the first wake up */ + (void)atomic_fetch_inc_release(&rbwork->seq); + wake_up_all(&rbwork->waiters); if (rbwork->full_waiters_pending || rbwork->wakeup_full) { /* Only cpu_buffer sets the above flags */ @@ -834,51 +838,24 @@ static bool rb_watermark_hit(struct trace_buffer *buffer, int cpu, int full) pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; ret = !pagebusy && full_hit(buffer, cpu, full); - if (!cpu_buffer->shortest_full || - cpu_buffer->shortest_full > full) - cpu_buffer->shortest_full = full; + if (!ret && (!cpu_buffer->shortest_full || + cpu_buffer->shortest_full > full)) { + cpu_buffer->shortest_full = full; + } raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } return ret; } -/** - * ring_buffer_wait - wait for input to the ring buffer - * @buffer: buffer to wait on - * @cpu: the cpu buffer to wait on - * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS - * - * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon - * as data is added to any of the @buffer's cpu buffers. Otherwise - * it will wait for data to be added to a specific cpu buffer. - */ -int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) +static inline bool +rb_wait_cond(struct rb_irq_work *rbwork, struct trace_buffer *buffer, + int cpu, int full, ring_buffer_cond_fn cond, void *data) { - struct ring_buffer_per_cpu *cpu_buffer; - DEFINE_WAIT(wait); - struct rb_irq_work *work; - int ret = 0; - - /* - * Depending on what the caller is waiting for, either any - * data in any cpu buffer, or a specific buffer, put the - * caller on the appropriate wait queue. - */ - if (cpu == RING_BUFFER_ALL_CPUS) { - work = &buffer->irq_work; - /* Full only makes sense on per cpu reads */ - full = 0; - } else { - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return -ENODEV; - cpu_buffer = buffer->buffers[cpu]; - work = &cpu_buffer->irq_work; - } + if (rb_watermark_hit(buffer, cpu, full)) + return true; - if (full) - prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE); - else - prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); + if (cond(data)) + return true; /* * The events can happen in critical sections where @@ -901,27 +878,82 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) * a task has been queued. It's OK for spurious wake ups. */ if (full) - work->full_waiters_pending = true; + rbwork->full_waiters_pending = true; else - work->waiters_pending = true; + rbwork->waiters_pending = true; - if (rb_watermark_hit(buffer, cpu, full)) - goto out; + return false; +} - if (signal_pending(current)) { - ret = -EINTR; - goto out; +struct rb_wait_data { + struct rb_irq_work *irq_work; + int seq; +}; + +/* + * The default wait condition for ring_buffer_wait() is to just to exit the + * wait loop the first time it is woken up. + */ +static bool rb_wait_once(void *data) +{ + struct rb_wait_data *rdata = data; + struct rb_irq_work *rbwork = rdata->irq_work; + + return atomic_read_acquire(&rbwork->seq) != rdata->seq; +} + +/** + * ring_buffer_wait - wait for input to the ring buffer + * @buffer: buffer to wait on + * @cpu: the cpu buffer to wait on + * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS + * @cond: condition function to break out of wait (NULL to run once) + * @data: the data to pass to @cond. + * + * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon + * as data is added to any of the @buffer's cpu buffers. Otherwise + * it will wait for data to be added to a specific cpu buffer. + */ +int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full, + ring_buffer_cond_fn cond, void *data) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct wait_queue_head *waitq; + struct rb_irq_work *rbwork; + struct rb_wait_data rdata; + int ret = 0; + + /* + * Depending on what the caller is waiting for, either any + * data in any cpu buffer, or a specific buffer, put the + * caller on the appropriate wait queue. + */ + if (cpu == RING_BUFFER_ALL_CPUS) { + rbwork = &buffer->irq_work; + /* Full only makes sense on per cpu reads */ + full = 0; + } else { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -ENODEV; + cpu_buffer = buffer->buffers[cpu]; + rbwork = &cpu_buffer->irq_work; } - schedule(); - out: if (full) - finish_wait(&work->full_waiters, &wait); + waitq = &rbwork->full_waiters; else - finish_wait(&work->waiters, &wait); + waitq = &rbwork->waiters; + + /* Set up to exit loop as soon as it is woken */ + if (!cond) { + cond = rb_wait_once; + rdata.irq_work = rbwork; + rdata.seq = atomic_read_acquire(&rbwork->seq); + data = &rdata; + } - if (!ret && !rb_watermark_hit(buffer, cpu, full) && signal_pending(current)) - ret = -EINTR; + ret = wait_event_interruptible((*waitq), + rb_wait_cond(rbwork, buffer, cpu, full, cond, data)); return ret; } @@ -959,21 +991,30 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, } if (full) { - unsigned long flags; - poll_wait(filp, &rbwork->full_waiters, poll_table); - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + if (rb_watermark_hit(buffer, cpu, full)) + return EPOLLIN | EPOLLRDNORM; + /* + * Only allow full_waiters_pending update to be seen after + * the shortest_full is set (in rb_watermark_hit). If the + * writer sees the full_waiters_pending flag set, it will + * compare the amount in the ring buffer to shortest_full. + * If the amount in the ring buffer is greater than the + * shortest_full percent, it will call the irq_work handler + * to wake up this list. The irq_handler will reset shortest_full + * back to zero. That's done under the reader_lock, but + * the below smp_mb() makes sure that the update to + * full_waiters_pending doesn't leak up into the above. + */ + smp_mb(); rbwork->full_waiters_pending = true; - if (!cpu_buffer->shortest_full || - cpu_buffer->shortest_full > full) - cpu_buffer->shortest_full = full; - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - } else { - poll_wait(filp, &rbwork->waiters, poll_table); - rbwork->waiters_pending = true; + return 0; } + poll_wait(filp, &rbwork->waiters, poll_table); + rbwork->waiters_pending = true; + /* * There's a tight race between setting the waiters_pending and * checking if the ring buffer is empty. Once the waiters_pending bit @@ -989,9 +1030,6 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, */ smp_mb(); - if (full) - return full_hit(buffer, cpu, full) ? EPOLLIN | EPOLLRDNORM : 0; - if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) return EPOLLIN | EPOLLRDNORM; @@ -1485,7 +1523,8 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, list_add(&bpage->list, pages); - page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, + page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), + mflags | __GFP_ZERO, cpu_buffer->buffer->subbuf_order); if (!page) goto free_pages; @@ -1570,7 +1609,8 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) cpu_buffer->reader_page = bpage; - page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, cpu_buffer->buffer->subbuf_order); + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_ZERO, + cpu_buffer->buffer->subbuf_order); if (!page) goto fail_free_reader; bpage->page = page_address(page); @@ -4350,7 +4390,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter) cpu_buffer = iter->cpu_buffer; reader = cpu_buffer->reader_page; head_page = cpu_buffer->head_page; - commit_page = cpu_buffer->commit_page; + commit_page = READ_ONCE(cpu_buffer->commit_page); commit_ts = commit_page->page->time_stamp; /* @@ -5538,7 +5578,8 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) if (bpage->data) goto out; - page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, + page = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL | __GFP_NORETRY | __GFP_ZERO, cpu_buffer->buffer->subbuf_order); if (!page) { kfree(bpage); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c9c898307348..233d1af39fff 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -13,7 +13,7 @@ * Copyright (C) 2004 Nadia Yvette Chambers */ #include <linux/ring_buffer.h> -#include <generated/utsrelease.h> +#include <linux/utsname.h> #include <linux/stacktrace.h> #include <linux/writeback.h> #include <linux/kallsyms.h> @@ -39,7 +39,6 @@ #include <linux/ctype.h> #include <linux/init.h> #include <linux/panic_notifier.h> -#include <linux/kmemleak.h> #include <linux/poll.h> #include <linux/nmi.h> #include <linux/fs.h> @@ -105,7 +104,7 @@ dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) * tracing is active, only save the comm when a trace event * occurred. */ -static DEFINE_PER_CPU(bool, trace_taskinfo_save); +DEFINE_PER_CPU(bool, trace_taskinfo_save); /* * Kill all tracing for good (never come back). @@ -131,9 +130,12 @@ cpumask_var_t __read_mostly tracing_buffer_mask; * /proc/sys/kernel/ftrace_dump_on_oops * Set 1 if you want to dump buffers of all CPUs * Set 2 if you want to dump the buffer of the CPU that triggered oops + * Set instance name if you want to dump the specific trace instance + * Multiple instance dump is also supported, and instances are seperated + * by commas. */ - -enum ftrace_dump_mode ftrace_dump_on_oops; +/* Set to string format zero to disable by default */ +char ftrace_dump_on_oops[MAX_TRACER_SIZE] = "0"; /* When set, tracing will stop when a WARN*() is hit */ int __disable_trace_on_warning; @@ -179,7 +181,6 @@ static void ftrace_trace_userstack(struct trace_array *tr, struct trace_buffer *buffer, unsigned int trace_ctx); -#define MAX_TRACER_SIZE 100 static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; static char *default_bootup_tracer; @@ -202,19 +203,33 @@ static int __init set_cmdline_ftrace(char *str) } __setup("ftrace=", set_cmdline_ftrace); +int ftrace_dump_on_oops_enabled(void) +{ + if (!strcmp("0", ftrace_dump_on_oops)) + return 0; + else + return 1; +} + static int __init set_ftrace_dump_on_oops(char *str) { - if (*str++ != '=' || !*str || !strcmp("1", str)) { - ftrace_dump_on_oops = DUMP_ALL; + if (!*str) { + strscpy(ftrace_dump_on_oops, "1", MAX_TRACER_SIZE); return 1; } - if (!strcmp("orig_cpu", str) || !strcmp("2", str)) { - ftrace_dump_on_oops = DUMP_ORIG; - return 1; - } + if (*str == ',') { + strscpy(ftrace_dump_on_oops, "1", MAX_TRACER_SIZE); + strscpy(ftrace_dump_on_oops + 1, str, MAX_TRACER_SIZE - 1); + return 1; + } - return 0; + if (*str++ == '=') { + strscpy(ftrace_dump_on_oops, str, MAX_TRACER_SIZE); + return 1; + } + + return 0; } __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); @@ -1301,6 +1316,50 @@ static void free_snapshot(struct trace_array *tr) tr->allocated_snapshot = false; } +static int tracing_arm_snapshot_locked(struct trace_array *tr) +{ + int ret; + + lockdep_assert_held(&trace_types_lock); + + spin_lock(&tr->snapshot_trigger_lock); + if (tr->snapshot == UINT_MAX) { + spin_unlock(&tr->snapshot_trigger_lock); + return -EBUSY; + } + + tr->snapshot++; + spin_unlock(&tr->snapshot_trigger_lock); + + ret = tracing_alloc_snapshot_instance(tr); + if (ret) { + spin_lock(&tr->snapshot_trigger_lock); + tr->snapshot--; + spin_unlock(&tr->snapshot_trigger_lock); + } + + return ret; +} + +int tracing_arm_snapshot(struct trace_array *tr) +{ + int ret; + + mutex_lock(&trace_types_lock); + ret = tracing_arm_snapshot_locked(tr); + mutex_unlock(&trace_types_lock); + + return ret; +} + +void tracing_disarm_snapshot(struct trace_array *tr) +{ + spin_lock(&tr->snapshot_trigger_lock); + if (!WARN_ON(!tr->snapshot)) + tr->snapshot--; + spin_unlock(&tr->snapshot_trigger_lock); +} + /** * tracing_alloc_snapshot - allocate snapshot buffer. * @@ -1374,10 +1433,6 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, mutex_lock(&trace_types_lock); - ret = tracing_alloc_snapshot_instance(tr); - if (ret) - goto fail_unlock; - if (tr->current_trace->use_max_tr) { ret = -EBUSY; goto fail_unlock; @@ -1396,6 +1451,10 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, goto fail_unlock; } + ret = tracing_arm_snapshot_locked(tr); + if (ret) + goto fail_unlock; + local_irq_disable(); arch_spin_lock(&tr->max_lock); tr->cond_snapshot = cond_snapshot; @@ -1440,6 +1499,8 @@ int tracing_snapshot_cond_disable(struct trace_array *tr) arch_spin_unlock(&tr->max_lock); local_irq_enable(); + tracing_disarm_snapshot(tr); + return ret; } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); @@ -1482,6 +1543,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr) } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); #define free_snapshot(tr) do { } while (0) +#define tracing_arm_snapshot_locked(tr) ({ -EBUSY; }) #endif /* CONFIG_TRACER_SNAPSHOT */ void tracer_tracing_off(struct trace_array *tr) @@ -1955,15 +2017,36 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) #endif /* CONFIG_TRACER_MAX_TRACE */ +struct pipe_wait { + struct trace_iterator *iter; + int wait_index; +}; + +static bool wait_pipe_cond(void *data) +{ + struct pipe_wait *pwait = data; + struct trace_iterator *iter = pwait->iter; + + if (atomic_read_acquire(&iter->wait_index) != pwait->wait_index) + return true; + + return iter->closed; +} + static int wait_on_pipe(struct trace_iterator *iter, int full) { + struct pipe_wait pwait; int ret; /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) return 0; - ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full); + pwait.wait_index = atomic_read_acquire(&iter->wait_index); + pwait.iter = iter; + + ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full, + wait_pipe_cond, &pwait); #ifdef CONFIG_TRACER_MAX_TRACE /* @@ -2299,98 +2382,6 @@ void tracing_reset_all_online_cpus(void) mutex_unlock(&trace_types_lock); } -/* - * The tgid_map array maps from pid to tgid; i.e. the value stored at index i - * is the tgid last observed corresponding to pid=i. - */ -static int *tgid_map; - -/* The maximum valid index into tgid_map. */ -static size_t tgid_map_max; - -#define SAVED_CMDLINES_DEFAULT 128 -#define NO_CMDLINE_MAP UINT_MAX -/* - * Preemption must be disabled before acquiring trace_cmdline_lock. - * The various trace_arrays' max_lock must be acquired in a context - * where interrupt is disabled. - */ -static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; -struct saved_cmdlines_buffer { - unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; - unsigned *map_cmdline_to_pid; - unsigned cmdline_num; - int cmdline_idx; - char saved_cmdlines[]; -}; -static struct saved_cmdlines_buffer *savedcmd; - -static inline char *get_saved_cmdlines(int idx) -{ - return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN]; -} - -static inline void set_cmdline(int idx, const char *cmdline) -{ - strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); -} - -static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) -{ - int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN); - - kfree(s->map_cmdline_to_pid); - kmemleak_free(s); - free_pages((unsigned long)s, order); -} - -static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val) -{ - struct saved_cmdlines_buffer *s; - struct page *page; - int orig_size, size; - int order; - - /* Figure out how much is needed to hold the given number of cmdlines */ - orig_size = sizeof(*s) + val * TASK_COMM_LEN; - order = get_order(orig_size); - size = 1 << (order + PAGE_SHIFT); - page = alloc_pages(GFP_KERNEL, order); - if (!page) - return NULL; - - s = page_address(page); - kmemleak_alloc(s, size, 1, GFP_KERNEL); - memset(s, 0, sizeof(*s)); - - /* Round up to actual allocation */ - val = (size - sizeof(*s)) / TASK_COMM_LEN; - s->cmdline_num = val; - - s->map_cmdline_to_pid = kmalloc_array(val, - sizeof(*s->map_cmdline_to_pid), - GFP_KERNEL); - if (!s->map_cmdline_to_pid) { - free_saved_cmdlines_buffer(s); - return NULL; - } - - s->cmdline_idx = 0; - memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, - sizeof(s->map_pid_to_cmdline)); - memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, - val * sizeof(*s->map_cmdline_to_pid)); - - return s; -} - -static int trace_create_savedcmd(void) -{ - savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT); - - return savedcmd ? 0 : -ENOMEM; -} - int is_tracing_stopped(void) { return global_trace.stop_count; @@ -2483,201 +2474,6 @@ void tracing_stop(void) return tracing_stop_tr(&global_trace); } -static int trace_save_cmdline(struct task_struct *tsk) -{ - unsigned tpid, idx; - - /* treat recording of idle task as a success */ - if (!tsk->pid) - return 1; - - tpid = tsk->pid & (PID_MAX_DEFAULT - 1); - - /* - * It's not the end of the world if we don't get - * the lock, but we also don't want to spin - * nor do we want to disable interrupts, - * so if we miss here, then better luck next time. - * - * This is called within the scheduler and wake up, so interrupts - * had better been disabled and run queue lock been held. - */ - lockdep_assert_preemption_disabled(); - if (!arch_spin_trylock(&trace_cmdline_lock)) - return 0; - - idx = savedcmd->map_pid_to_cmdline[tpid]; - if (idx == NO_CMDLINE_MAP) { - idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num; - - savedcmd->map_pid_to_cmdline[tpid] = idx; - savedcmd->cmdline_idx = idx; - } - - savedcmd->map_cmdline_to_pid[idx] = tsk->pid; - set_cmdline(idx, tsk->comm); - - arch_spin_unlock(&trace_cmdline_lock); - - return 1; -} - -static void __trace_find_cmdline(int pid, char comm[]) -{ - unsigned map; - int tpid; - - if (!pid) { - strcpy(comm, "<idle>"); - return; - } - - if (WARN_ON_ONCE(pid < 0)) { - strcpy(comm, "<XXX>"); - return; - } - - tpid = pid & (PID_MAX_DEFAULT - 1); - map = savedcmd->map_pid_to_cmdline[tpid]; - if (map != NO_CMDLINE_MAP) { - tpid = savedcmd->map_cmdline_to_pid[map]; - if (tpid == pid) { - strscpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); - return; - } - } - strcpy(comm, "<...>"); -} - -void trace_find_cmdline(int pid, char comm[]) -{ - preempt_disable(); - arch_spin_lock(&trace_cmdline_lock); - - __trace_find_cmdline(pid, comm); - - arch_spin_unlock(&trace_cmdline_lock); - preempt_enable(); -} - -static int *trace_find_tgid_ptr(int pid) -{ - /* - * Pairs with the smp_store_release in set_tracer_flag() to ensure that - * if we observe a non-NULL tgid_map then we also observe the correct - * tgid_map_max. - */ - int *map = smp_load_acquire(&tgid_map); - - if (unlikely(!map || pid > tgid_map_max)) - return NULL; - - return &map[pid]; -} - -int trace_find_tgid(int pid) -{ - int *ptr = trace_find_tgid_ptr(pid); - - return ptr ? *ptr : 0; -} - -static int trace_save_tgid(struct task_struct *tsk) -{ - int *ptr; - - /* treat recording of idle task as a success */ - if (!tsk->pid) - return 1; - - ptr = trace_find_tgid_ptr(tsk->pid); - if (!ptr) - return 0; - - *ptr = tsk->tgid; - return 1; -} - -static bool tracing_record_taskinfo_skip(int flags) -{ - if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID)))) - return true; - if (!__this_cpu_read(trace_taskinfo_save)) - return true; - return false; -} - -/** - * tracing_record_taskinfo - record the task info of a task - * - * @task: task to record - * @flags: TRACE_RECORD_CMDLINE for recording comm - * TRACE_RECORD_TGID for recording tgid - */ -void tracing_record_taskinfo(struct task_struct *task, int flags) -{ - bool done; - - if (tracing_record_taskinfo_skip(flags)) - return; - - /* - * Record as much task information as possible. If some fail, continue - * to try to record the others. - */ - done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task); - done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task); - - /* If recording any information failed, retry again soon. */ - if (!done) - return; - - __this_cpu_write(trace_taskinfo_save, false); -} - -/** - * tracing_record_taskinfo_sched_switch - record task info for sched_switch - * - * @prev: previous task during sched_switch - * @next: next task during sched_switch - * @flags: TRACE_RECORD_CMDLINE for recording comm - * TRACE_RECORD_TGID for recording tgid - */ -void tracing_record_taskinfo_sched_switch(struct task_struct *prev, - struct task_struct *next, int flags) -{ - bool done; - - if (tracing_record_taskinfo_skip(flags)) - return; - - /* - * Record as much task information as possible. If some fail, continue - * to try to record the others. - */ - done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev); - done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next); - done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev); - done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next); - - /* If recording any information failed, retry again soon. */ - if (!done) - return; - - __this_cpu_write(trace_taskinfo_save, false); -} - -/* Helpers to record a specific task information */ -void tracing_record_cmdline(struct task_struct *task) -{ - tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE); -} - -void tracing_record_tgid(struct task_struct *task) -{ - tracing_record_taskinfo(task, TRACE_RECORD_TGID); -} - /* * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq * overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function @@ -4368,7 +4164,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) get_total_entries(buf, &total, &entries); seq_printf(m, "# %s latency trace v1.1.5 on %s\n", - name, UTS_RELEASE); + name, init_utsname()->release); seq_puts(m, "# -----------------------------------" "---------------------------------\n"); seq_printf(m, "# latency: %lu us, #%lu/%lu, CPU#%d |" @@ -5436,8 +5232,6 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) { - int *map; - if ((mask == TRACE_ITER_RECORD_TGID) || (mask == TRACE_ITER_RECORD_CMD)) lockdep_assert_held(&event_mutex); @@ -5460,20 +5254,8 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) trace_event_enable_cmd_record(enabled); if (mask == TRACE_ITER_RECORD_TGID) { - if (!tgid_map) { - tgid_map_max = pid_max; - map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map), - GFP_KERNEL); - /* - * Pairs with smp_load_acquire() in - * trace_find_tgid_ptr() to ensure that if it observes - * the tgid_map we just allocated then it also observes - * the corresponding tgid_map_max value. - */ - smp_store_release(&tgid_map, map); - } - if (!tgid_map) { + if (trace_alloc_tgid_map() < 0) { tr->trace_flags &= ~TRACE_ITER_RECORD_TGID; return -ENOMEM; } @@ -5747,16 +5529,15 @@ static const char readme_msg[] = "\t args: <name>=fetcharg[:type]\n" "\t fetcharg: (%<register>|$<efield>), @<address>, @<symbol>[+|-<offset>],\n" #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API -#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS "\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n" +#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS "\t <argname>[->field[->field|.field...]],\n" -#else - "\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n" #endif #else "\t $stack<index>, $stack, $retval, $comm,\n" #endif "\t +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n" + "\t kernel return probes support: $retval, $arg<N>, $comm\n" "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n" "\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n" "\t symstr, <type>\\[<array-size>\\]\n" @@ -5918,207 +5699,6 @@ static const struct file_operations tracing_readme_fops = { .llseek = generic_file_llseek, }; -static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos) -{ - int pid = ++(*pos); - - return trace_find_tgid_ptr(pid); -} - -static void *saved_tgids_start(struct seq_file *m, loff_t *pos) -{ - int pid = *pos; - - return trace_find_tgid_ptr(pid); -} - -static void saved_tgids_stop(struct seq_file *m, void *v) -{ -} - -static int saved_tgids_show(struct seq_file *m, void *v) -{ - int *entry = (int *)v; - int pid = entry - tgid_map; - int tgid = *entry; - - if (tgid == 0) - return SEQ_SKIP; - - seq_printf(m, "%d %d\n", pid, tgid); - return 0; -} - -static const struct seq_operations tracing_saved_tgids_seq_ops = { - .start = saved_tgids_start, - .stop = saved_tgids_stop, - .next = saved_tgids_next, - .show = saved_tgids_show, -}; - -static int tracing_saved_tgids_open(struct inode *inode, struct file *filp) -{ - int ret; - - ret = tracing_check_open_get_tr(NULL); - if (ret) - return ret; - - return seq_open(filp, &tracing_saved_tgids_seq_ops); -} - - -static const struct file_operations tracing_saved_tgids_fops = { - .open = tracing_saved_tgids_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) -{ - unsigned int *ptr = v; - - if (*pos || m->count) - ptr++; - - (*pos)++; - - for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num]; - ptr++) { - if (*ptr == -1 || *ptr == NO_CMDLINE_MAP) - continue; - - return ptr; - } - - return NULL; -} - -static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos) -{ - void *v; - loff_t l = 0; - - preempt_disable(); - arch_spin_lock(&trace_cmdline_lock); - - v = &savedcmd->map_cmdline_to_pid[0]; - while (l <= *pos) { - v = saved_cmdlines_next(m, v, &l); - if (!v) - return NULL; - } - - return v; -} - -static void saved_cmdlines_stop(struct seq_file *m, void *v) -{ - arch_spin_unlock(&trace_cmdline_lock); - preempt_enable(); -} - -static int saved_cmdlines_show(struct seq_file *m, void *v) -{ - char buf[TASK_COMM_LEN]; - unsigned int *pid = v; - - __trace_find_cmdline(*pid, buf); - seq_printf(m, "%d %s\n", *pid, buf); - return 0; -} - -static const struct seq_operations tracing_saved_cmdlines_seq_ops = { - .start = saved_cmdlines_start, - .next = saved_cmdlines_next, - .stop = saved_cmdlines_stop, - .show = saved_cmdlines_show, -}; - -static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp) -{ - int ret; - - ret = tracing_check_open_get_tr(NULL); - if (ret) - return ret; - - return seq_open(filp, &tracing_saved_cmdlines_seq_ops); -} - -static const struct file_operations tracing_saved_cmdlines_fops = { - .open = tracing_saved_cmdlines_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static ssize_t -tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - int r; - - preempt_disable(); - arch_spin_lock(&trace_cmdline_lock); - r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num); - arch_spin_unlock(&trace_cmdline_lock); - preempt_enable(); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static int tracing_resize_saved_cmdlines(unsigned int val) -{ - struct saved_cmdlines_buffer *s, *savedcmd_temp; - - s = allocate_cmdlines_buffer(val); - if (!s) - return -ENOMEM; - - preempt_disable(); - arch_spin_lock(&trace_cmdline_lock); - savedcmd_temp = savedcmd; - savedcmd = s; - arch_spin_unlock(&trace_cmdline_lock); - preempt_enable(); - free_saved_cmdlines_buffer(savedcmd_temp); - - return 0; -} - -static ssize_t -tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - unsigned long val; - int ret; - - ret = kstrtoul_from_user(ubuf, cnt, 10, &val); - if (ret) - return ret; - - /* must have at least 1 entry or less than PID_MAX_DEFAULT */ - if (!val || val > PID_MAX_DEFAULT) - return -EINVAL; - - ret = tracing_resize_saved_cmdlines((unsigned int)val); - if (ret < 0) - return ret; - - *ppos += cnt; - - return cnt; -} - -static const struct file_operations tracing_saved_cmdlines_size_fops = { - .open = tracing_open_generic, - .read = tracing_saved_cmdlines_size_read, - .write = tracing_saved_cmdlines_size_write, -}; - #ifdef CONFIG_TRACE_EVAL_MAP_FILE static union trace_eval_map_item * update_eval_map(union trace_eval_map_item *ptr) @@ -6595,11 +6175,12 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) */ synchronize_rcu(); free_snapshot(tr); + tracing_disarm_snapshot(tr); } - if (t->use_max_tr && !tr->allocated_snapshot) { - ret = tracing_alloc_snapshot_instance(tr); - if (ret < 0) + if (!had_max_tr && t->use_max_tr) { + ret = tracing_arm_snapshot_locked(tr); + if (ret) goto out; } #else @@ -6608,8 +6189,13 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) if (t->init) { ret = tracer_init(t, tr); - if (ret) + if (ret) { +#ifdef CONFIG_TRACER_MAX_TRACE + if (t->use_max_tr) + tracing_disarm_snapshot(tr); +#endif goto out; + } } tr->current_trace = t; @@ -7711,10 +7297,11 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, if (tr->allocated_snapshot) ret = resize_buffer_duplicate_size(&tr->max_buffer, &tr->array_buffer, iter->cpu_file); - else - ret = tracing_alloc_snapshot_instance(tr); - if (ret < 0) + + ret = tracing_arm_snapshot_locked(tr); + if (ret) break; + /* Now, we're going to swap */ if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { local_irq_disable(); @@ -7724,6 +7311,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer, (void *)tr, 1); } + tracing_disarm_snapshot(tr); break; default: if (tr->allocated_snapshot) { @@ -8398,9 +7986,9 @@ static int tracing_buffers_flush(struct file *file, fl_owner_t id) struct ftrace_buffer_info *info = file->private_data; struct trace_iterator *iter = &info->iter; - iter->wait_index++; + iter->closed = true; /* Make sure the waiters see the new wait_index */ - smp_wmb(); + (void)atomic_fetch_inc_release(&iter->wait_index); ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); @@ -8500,6 +8088,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, .spd_release = buffer_spd_release, }; struct buffer_ref *ref; + bool woken = false; int page_size; int entries, i; ssize_t ret = 0; @@ -8573,17 +8162,17 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, /* did we read anything? */ if (!spd.nr_pages) { - long wait_index; if (ret) goto out; + if (woken) + goto out; + ret = -EAGAIN; if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) goto out; - wait_index = READ_ONCE(iter->wait_index); - ret = wait_on_pipe(iter, iter->snapshot ? 0 : iter->tr->buffer_percent); if (ret) goto out; @@ -8592,10 +8181,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, if (!tracer_tracing_is_on(iter->tr)) goto out; - /* Make sure we see the new wait_index */ - smp_rmb(); - if (wait_index != iter->wait_index) - goto out; + /* Iterate one more time to collect any new data then exit */ + woken = true; goto again; } @@ -8618,9 +8205,8 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned mutex_lock(&trace_types_lock); - iter->wait_index++; /* Make sure the waiters see the new wait_index */ - smp_wmb(); + (void)atomic_fetch_inc_release(&iter->wait_index); ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); @@ -8857,8 +8443,13 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops; - if (glob[0] == '!') - return unregister_ftrace_function_probe_func(glob+1, tr, ops); + if (glob[0] == '!') { + ret = unregister_ftrace_function_probe_func(glob+1, tr, ops); + if (!ret) + tracing_disarm_snapshot(tr); + + return ret; + } if (!param) goto out_reg; @@ -8877,12 +8468,13 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, return ret; out_reg: - ret = tracing_alloc_snapshot_instance(tr); + ret = tracing_arm_snapshot(tr); if (ret < 0) goto out; ret = register_ftrace_function_probe(glob, tr, ops, count); - + if (ret < 0) + tracing_disarm_snapshot(tr); out: return ret < 0 ? ret : 0; } @@ -9689,7 +9281,9 @@ trace_array_create_systems(const char *name, const char *systems) raw_spin_lock_init(&tr->start_lock); tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; - +#ifdef CONFIG_TRACER_MAX_TRACE + spin_lock_init(&tr->snapshot_trigger_lock); +#endif tr->current_trace = &nop_trace; INIT_LIST_HEAD(&tr->systems); @@ -10254,14 +9848,14 @@ static struct notifier_block trace_die_notifier = { static int trace_die_panic_handler(struct notifier_block *self, unsigned long ev, void *unused) { - if (!ftrace_dump_on_oops) + if (!ftrace_dump_on_oops_enabled()) return NOTIFY_DONE; /* The die notifier requires DIE_OOPS to trigger */ if (self == &trace_die_notifier && ev != DIE_OOPS) return NOTIFY_DONE; - ftrace_dump(ftrace_dump_on_oops); + ftrace_dump(DUMP_PARAM); return NOTIFY_DONE; } @@ -10302,12 +9896,12 @@ trace_printk_seq(struct trace_seq *s) trace_seq_init(s); } -void trace_init_global_iter(struct trace_iterator *iter) +static void trace_init_iter(struct trace_iterator *iter, struct trace_array *tr) { - iter->tr = &global_trace; + iter->tr = tr; iter->trace = iter->tr->current_trace; iter->cpu_file = RING_BUFFER_ALL_CPUS; - iter->array_buffer = &global_trace.array_buffer; + iter->array_buffer = &tr->array_buffer; if (iter->trace && iter->trace->open) iter->trace->open(iter); @@ -10327,22 +9921,19 @@ void trace_init_global_iter(struct trace_iterator *iter) iter->fmt_size = STATIC_FMT_BUF_SIZE; } -void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) +void trace_init_global_iter(struct trace_iterator *iter) +{ + trace_init_iter(iter, &global_trace); +} + +static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_mode) { /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; - static atomic_t dump_running; - struct trace_array *tr = &global_trace; unsigned int old_userobj; unsigned long flags; int cnt = 0, cpu; - /* Only allow one dump user at a time. */ - if (atomic_inc_return(&dump_running) != 1) { - atomic_dec(&dump_running); - return; - } - /* * Always turn off tracing when we dump. * We don't need to show trace output of what happens @@ -10351,12 +9942,12 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) * If the user does a sysrq-z, then they can re-enable * tracing with echo 1 > tracing_on. */ - tracing_off(); + tracer_tracing_off(tr); local_irq_save(flags); /* Simulate the iterator */ - trace_init_global_iter(&iter); + trace_init_iter(&iter, tr); for_each_tracing_cpu(cpu) { atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); @@ -10367,21 +9958,15 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) /* don't look at user memory in panic mode */ tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; - switch (oops_dump_mode) { - case DUMP_ALL: - iter.cpu_file = RING_BUFFER_ALL_CPUS; - break; - case DUMP_ORIG: + if (dump_mode == DUMP_ORIG) iter.cpu_file = raw_smp_processor_id(); - break; - case DUMP_NONE: - goto out_enable; - default: - printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); + else iter.cpu_file = RING_BUFFER_ALL_CPUS; - } - printk(KERN_TRACE "Dumping ftrace buffer:\n"); + if (tr == &global_trace) + printk(KERN_TRACE "Dumping ftrace buffer:\n"); + else + printk(KERN_TRACE "Dumping ftrace instance %s buffer:\n", tr->name); /* Did function tracer already get disabled? */ if (ftrace_is_dead()) { @@ -10423,15 +10008,84 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) else printk(KERN_TRACE "---------------------------------\n"); - out_enable: tr->trace_flags |= old_userobj; for_each_tracing_cpu(cpu) { atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); } - atomic_dec(&dump_running); local_irq_restore(flags); } + +static void ftrace_dump_by_param(void) +{ + bool first_param = true; + char dump_param[MAX_TRACER_SIZE]; + char *buf, *token, *inst_name; + struct trace_array *tr; + + strscpy(dump_param, ftrace_dump_on_oops, MAX_TRACER_SIZE); + buf = dump_param; + + while ((token = strsep(&buf, ",")) != NULL) { + if (first_param) { + first_param = false; + if (!strcmp("0", token)) + continue; + else if (!strcmp("1", token)) { + ftrace_dump_one(&global_trace, DUMP_ALL); + continue; + } + else if (!strcmp("2", token) || + !strcmp("orig_cpu", token)) { + ftrace_dump_one(&global_trace, DUMP_ORIG); + continue; + } + } + + inst_name = strsep(&token, "="); + tr = trace_array_find(inst_name); + if (!tr) { + printk(KERN_TRACE "Instance %s not found\n", inst_name); + continue; + } + + if (token && (!strcmp("2", token) || + !strcmp("orig_cpu", token))) + ftrace_dump_one(tr, DUMP_ORIG); + else + ftrace_dump_one(tr, DUMP_ALL); + } +} + +void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) +{ + static atomic_t dump_running; + + /* Only allow one dump user at a time. */ + if (atomic_inc_return(&dump_running) != 1) { + atomic_dec(&dump_running); + return; + } + + switch (oops_dump_mode) { + case DUMP_ALL: + ftrace_dump_one(&global_trace, DUMP_ALL); + break; + case DUMP_ORIG: + ftrace_dump_one(&global_trace, DUMP_ORIG); + break; + case DUMP_PARAM: + ftrace_dump_by_param(); + break; + case DUMP_NONE: + break; + default: + printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); + ftrace_dump_one(&global_trace, DUMP_ALL); + } + + atomic_dec(&dump_running); +} EXPORT_SYMBOL_GPL(ftrace_dump); #define WRITE_BUFSIZE 4096 @@ -10659,7 +10313,9 @@ __init static int tracer_alloc_buffers(void) global_trace.current_trace = &nop_trace; global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; - +#ifdef CONFIG_TRACER_MAX_TRACE + spin_lock_init(&global_trace.snapshot_trigger_lock); +#endif ftrace_init_global_array_ops(&global_trace); init_trace_flags_index(&global_trace); @@ -10696,7 +10352,7 @@ __init static int tracer_alloc_buffers(void) out_free_pipe_cpumask: free_cpumask_var(global_trace.pipe_cpumask); out_free_savedcmd: - free_saved_cmdlines_buffer(savedcmd); + trace_free_saved_cmdlines_buffer(); out_free_temp_buffer: ring_buffer_free(temp_buffer); out_rm_hp_state: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 00f873910c5d..64450615ca0c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -334,8 +334,8 @@ struct trace_array { */ struct array_buffer max_buffer; bool allocated_snapshot; -#endif -#ifdef CONFIG_TRACER_MAX_TRACE + spinlock_t snapshot_trigger_lock; + unsigned int snapshot; unsigned long max_latency; #ifdef CONFIG_FSNOTIFY struct dentry *d_max_latency; @@ -1375,6 +1375,16 @@ static inline void trace_buffer_unlock_commit(struct trace_array *tr, trace_buffer_unlock_commit_regs(tr, buffer, event, trace_ctx, NULL); } +DECLARE_PER_CPU(bool, trace_taskinfo_save); +int trace_save_cmdline(struct task_struct *tsk); +int trace_create_savedcmd(void); +int trace_alloc_tgid_map(void); +void trace_free_saved_cmdlines_buffer(void); + +extern const struct file_operations tracing_saved_cmdlines_fops; +extern const struct file_operations tracing_saved_tgids_fops; +extern const struct file_operations tracing_saved_cmdlines_size_fops; + DECLARE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); DECLARE_PER_CPU(int, trace_buffered_event_cnt); void trace_buffered_event_disable(void); @@ -1973,12 +1983,16 @@ static inline void trace_event_eval_update(struct trace_eval_map **map, int len) #ifdef CONFIG_TRACER_SNAPSHOT void tracing_snapshot_instance(struct trace_array *tr); int tracing_alloc_snapshot_instance(struct trace_array *tr); +int tracing_arm_snapshot(struct trace_array *tr); +void tracing_disarm_snapshot(struct trace_array *tr); #else static inline void tracing_snapshot_instance(struct trace_array *tr) { } static inline int tracing_alloc_snapshot_instance(struct trace_array *tr) { return 0; } +static inline int tracing_arm_snapshot(struct trace_array *tr) { return 0; } +static inline void tracing_disarm_snapshot(struct trace_array *tr) { } #endif #ifdef CONFIG_PREEMPT_TRACER diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 54d5fa35c90a..811b08439406 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -92,7 +92,6 @@ static void trace_do_benchmark(void) bm_total += delta; bm_totalsq += delta * delta; - if (bm_cnt > 1) { /* * Apply Welford's method to calculate standard deviation: @@ -105,7 +104,7 @@ static void trace_do_benchmark(void) stddev = 0; delta = bm_total; - do_div(delta, bm_cnt); + delta = div64_u64(delta, bm_cnt); avg = delta; if (stddev > 0) { @@ -127,7 +126,7 @@ static void trace_do_benchmark(void) seed = stddev; if (!last_seed) break; - do_div(seed, last_seed); + seed = div64_u64(seed, last_seed); seed += last_seed; do_div(seed, 2); } while (i++ < 10 && last_seed != seed); diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 03c851f57969..b0e0ec85912e 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -220,7 +220,7 @@ static struct trace_eprobe *alloc_event_probe(const char *group, if (!ep->event_system) goto error; - ret = trace_probe_init(&ep->tp, this_event, group, false); + ret = trace_probe_init(&ep->tp, this_event, group, false, nargs); if (ret < 0) goto error; @@ -390,8 +390,8 @@ static int get_eprobe_size(struct trace_probe *tp, void *rec) /* Note that we don't verify it, since the code does not come from user space */ static int -process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, - void *base) +process_fetch_insn(struct fetch_insn *code, void *rec, void *edata, + void *dest, void *base) { unsigned long val; int ret; @@ -438,7 +438,7 @@ __eprobe_trace_func(struct eprobe_data *edata, void *rec) return; entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); - store_trace_args(&entry[1], &edata->ep->tp, rec, sizeof(*entry), dsize); + store_trace_args(&entry[1], &edata->ep->tp, rec, NULL, sizeof(*entry), dsize); trace_event_buffer_commit(&fbuffer); } diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index b33c3861fbbb..4bec043c8690 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -597,20 +597,12 @@ out: return ret; } -/** - * unregister_trigger - Generic event_command @unreg implementation - * @glob: The raw string used to register the trigger - * @test: Trigger-specific data used to find the trigger to remove - * @file: The trace_event_file associated with the event - * - * Common implementation for event trigger unregistration. - * - * Usually used directly as the @unreg method in event command - * implementations. +/* + * True if the trigger was found and unregistered, else false. */ -static void unregister_trigger(char *glob, - struct event_trigger_data *test, - struct trace_event_file *file) +static bool try_unregister_trigger(char *glob, + struct event_trigger_data *test, + struct trace_event_file *file) { struct event_trigger_data *data = NULL, *iter; @@ -626,8 +618,32 @@ static void unregister_trigger(char *glob, } } - if (data && data->ops->free) - data->ops->free(data); + if (data) { + if (data->ops->free) + data->ops->free(data); + + return true; + } + + return false; +} + +/** + * unregister_trigger - Generic event_command @unreg implementation + * @glob: The raw string used to register the trigger + * @test: Trigger-specific data used to find the trigger to remove + * @file: The trace_event_file associated with the event + * + * Common implementation for event trigger unregistration. + * + * Usually used directly as the @unreg method in event command + * implementations. + */ +static void unregister_trigger(char *glob, + struct event_trigger_data *test, + struct trace_event_file *file) +{ + try_unregister_trigger(glob, test, file); } /* @@ -1470,12 +1486,23 @@ register_snapshot_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) { - int ret = tracing_alloc_snapshot_instance(file->tr); + int ret = tracing_arm_snapshot(file->tr); if (ret < 0) return ret; - return register_trigger(glob, data, file); + ret = register_trigger(glob, data, file); + if (ret < 0) + tracing_disarm_snapshot(file->tr); + return ret; +} + +static void unregister_snapshot_trigger(char *glob, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + if (try_unregister_trigger(glob, data, file)) + tracing_disarm_snapshot(file->tr); } static int @@ -1510,7 +1537,7 @@ static struct event_command trigger_snapshot_cmd = { .trigger_type = ETT_SNAPSHOT, .parse = event_trigger_parse, .reg = register_snapshot_trigger, - .unreg = unregister_trigger, + .unreg = unregister_snapshot_trigger, .get_trigger_ops = snapshot_get_trigger_ops, .set_filter = set_trigger_filter, }; diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index e76f5e1efdf2..70d428c394b6 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -34,7 +34,8 @@ /* Limit how long of an event name plus args within the subsystem. */ #define MAX_EVENT_DESC 512 -#define EVENT_NAME(user_event) ((user_event)->tracepoint.name) +#define EVENT_NAME(user_event) ((user_event)->reg_name) +#define EVENT_TP_NAME(user_event) ((user_event)->tracepoint.name) #define MAX_FIELD_ARRAY_SIZE 1024 /* @@ -54,10 +55,13 @@ * allows isolation for events by various means. */ struct user_event_group { - char *system_name; - struct hlist_node node; - struct mutex reg_mutex; + char *system_name; + char *system_multi_name; + struct hlist_node node; + struct mutex reg_mutex; DECLARE_HASHTABLE(register_table, 8); + /* ID that moves forward within the group for multi-event names */ + u64 multi_id; }; /* Group for init_user_ns mapping, top-most group */ @@ -78,6 +82,7 @@ static unsigned int current_user_events; */ struct user_event { struct user_event_group *group; + char *reg_name; struct tracepoint tracepoint; struct trace_event_call call; struct trace_event_class class; @@ -127,6 +132,8 @@ struct user_event_enabler { #define ENABLE_BIT(e) ((int)((e)->values & ENABLE_VAL_BIT_MASK)) +#define EVENT_MULTI_FORMAT(f) ((f) & USER_EVENT_REG_MULTI_FORMAT) + /* Used for asynchronous faulting in of pages */ struct user_event_enabler_fault { struct work_struct work; @@ -202,6 +209,8 @@ static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm); static struct user_event_mm *user_event_mm_get_all(struct user_event *user); static void user_event_mm_put(struct user_event_mm *mm); static int destroy_user_event(struct user_event *user); +static bool user_fields_match(struct user_event *user, int argc, + const char **argv); static u32 user_event_key(char *name) { @@ -328,6 +337,7 @@ out: static void user_event_group_destroy(struct user_event_group *group) { kfree(group->system_name); + kfree(group->system_multi_name); kfree(group); } @@ -346,6 +356,11 @@ static char *user_event_group_system_name(void) return system_name; } +static char *user_event_group_system_multi_name(void) +{ + return kstrdup(USER_EVENTS_MULTI_SYSTEM, GFP_KERNEL); +} + static struct user_event_group *current_user_event_group(void) { return init_group; @@ -365,6 +380,11 @@ static struct user_event_group *user_event_group_create(void) if (!group->system_name) goto error; + group->system_multi_name = user_event_group_system_multi_name(); + + if (!group->system_multi_name) + goto error; + mutex_init(&group->reg_mutex); hash_init(group->register_table); @@ -1480,6 +1500,11 @@ static int destroy_user_event(struct user_event *user) hash_del(&user->node); user_event_destroy_validators(user); + + /* If we have different names, both must be freed */ + if (EVENT_NAME(user) != EVENT_TP_NAME(user)) + kfree(EVENT_TP_NAME(user)); + kfree(user->call.print_fmt); kfree(EVENT_NAME(user)); kfree(user); @@ -1493,17 +1518,36 @@ static int destroy_user_event(struct user_event *user) } static struct user_event *find_user_event(struct user_event_group *group, - char *name, u32 *outkey) + char *name, int argc, const char **argv, + u32 flags, u32 *outkey) { struct user_event *user; u32 key = user_event_key(name); *outkey = key; - hash_for_each_possible(group->register_table, user, node, key) - if (!strcmp(EVENT_NAME(user), name)) + hash_for_each_possible(group->register_table, user, node, key) { + /* + * Single-format events shouldn't return multi-format + * events. Callers expect the underlying tracepoint to match + * the name exactly in these cases. Only check like-formats. + */ + if (EVENT_MULTI_FORMAT(flags) != EVENT_MULTI_FORMAT(user->reg_flags)) + continue; + + if (strcmp(EVENT_NAME(user), name)) + continue; + + if (user_fields_match(user, argc, argv)) return user_event_get(user); + /* Scan others if this is a multi-format event */ + if (EVENT_MULTI_FORMAT(flags)) + continue; + + return ERR_PTR(-EADDRINUSE); + } + return NULL; } @@ -1860,6 +1904,9 @@ static bool user_fields_match(struct user_event *user, int argc, struct list_head *head = &user->fields; int i = 0; + if (argc == 0) + return list_empty(head); + list_for_each_entry_reverse(field, head, link) { if (!user_field_match(field, argc, argv, &i)) return false; @@ -1877,13 +1924,15 @@ static bool user_event_match(const char *system, const char *event, struct user_event *user = container_of(ev, struct user_event, devent); bool match; - match = strcmp(EVENT_NAME(user), event) == 0 && - (!system || strcmp(system, USER_EVENTS_SYSTEM) == 0); + match = strcmp(EVENT_NAME(user), event) == 0; + + if (match && system) { + match = strcmp(system, user->group->system_name) == 0 || + strcmp(system, user->group->system_multi_name) == 0; + } - if (match && argc > 0) + if (match) match = user_fields_match(user, argc, argv); - else if (match && argc == 0) - match = list_empty(&user->fields); return match; } @@ -1913,6 +1962,33 @@ static int user_event_trace_register(struct user_event *user) return ret; } +static int user_event_set_tp_name(struct user_event *user) +{ + lockdep_assert_held(&user->group->reg_mutex); + + if (EVENT_MULTI_FORMAT(user->reg_flags)) { + char *multi_name; + + multi_name = kasprintf(GFP_KERNEL_ACCOUNT, "%s.%llx", + user->reg_name, user->group->multi_id); + + if (!multi_name) + return -ENOMEM; + + user->call.name = multi_name; + user->tracepoint.name = multi_name; + + /* Inc to ensure unique multi-event name next time */ + user->group->multi_id++; + } else { + /* Non Multi-format uses register name */ + user->call.name = user->reg_name; + user->tracepoint.name = user->reg_name; + } + + return 0; +} + /* * Parses the event name, arguments and flags then registers if successful. * The name buffer lifetime is owned by this method for success cases only. @@ -1922,11 +1998,11 @@ static int user_event_parse(struct user_event_group *group, char *name, char *args, char *flags, struct user_event **newuser, int reg_flags) { - int ret; - u32 key; struct user_event *user; + char **argv = NULL; int argc = 0; - char **argv; + int ret; + u32 key; /* Currently don't support any text based flags */ if (flags != NULL) @@ -1935,41 +2011,34 @@ static int user_event_parse(struct user_event_group *group, char *name, if (!user_event_capable(reg_flags)) return -EPERM; + if (args) { + argv = argv_split(GFP_KERNEL, args, &argc); + + if (!argv) + return -ENOMEM; + } + /* Prevent dyn_event from racing */ mutex_lock(&event_mutex); - user = find_user_event(group, name, &key); + user = find_user_event(group, name, argc, (const char **)argv, + reg_flags, &key); mutex_unlock(&event_mutex); - if (user) { - if (args) { - argv = argv_split(GFP_KERNEL, args, &argc); - if (!argv) { - ret = -ENOMEM; - goto error; - } + if (argv) + argv_free(argv); - ret = user_fields_match(user, argc, (const char **)argv); - argv_free(argv); - - } else - ret = list_empty(&user->fields); - - if (ret) { - *newuser = user; - /* - * Name is allocated by caller, free it since it already exists. - * Caller only worries about failure cases for freeing. - */ - kfree(name); - } else { - ret = -EADDRINUSE; - goto error; - } + if (IS_ERR(user)) + return PTR_ERR(user); + + if (user) { + *newuser = user; + /* + * Name is allocated by caller, free it since it already exists. + * Caller only worries about failure cases for freeing. + */ + kfree(name); return 0; -error: - user_event_put(user, false); - return ret; } user = kzalloc(sizeof(*user), GFP_KERNEL_ACCOUNT); @@ -1982,7 +2051,13 @@ error: INIT_LIST_HEAD(&user->validators); user->group = group; - user->tracepoint.name = name; + user->reg_name = name; + user->reg_flags = reg_flags; + + ret = user_event_set_tp_name(user); + + if (ret) + goto put_user; ret = user_event_parse_fields(user, args); @@ -1996,11 +2071,14 @@ error: user->call.data = user; user->call.class = &user->class; - user->call.name = name; user->call.flags = TRACE_EVENT_FL_TRACEPOINT; user->call.tp = &user->tracepoint; user->call.event.funcs = &user_event_funcs; - user->class.system = group->system_name; + + if (EVENT_MULTI_FORMAT(user->reg_flags)) + user->class.system = group->system_multi_name; + else + user->class.system = group->system_name; user->class.fields_array = user_event_fields_array; user->class.get_fields = user_event_get_fields; @@ -2022,8 +2100,6 @@ error: if (ret) goto put_user_lock; - user->reg_flags = reg_flags; - if (user->reg_flags & USER_EVENT_REG_PERSIST) { /* Ensure we track self ref and caller ref (2) */ refcount_set(&user->refcnt, 2); @@ -2047,30 +2123,43 @@ put_user: user_event_destroy_fields(user); user_event_destroy_validators(user); kfree(user->call.print_fmt); + + /* Caller frees reg_name on error, but not multi-name */ + if (EVENT_NAME(user) != EVENT_TP_NAME(user)) + kfree(EVENT_TP_NAME(user)); + kfree(user); return ret; } /* - * Deletes a previously created event if it is no longer being used. + * Deletes previously created events if they are no longer being used. */ static int delete_user_event(struct user_event_group *group, char *name) { - u32 key; - struct user_event *user = find_user_event(group, name, &key); + struct user_event *user; + struct hlist_node *tmp; + u32 key = user_event_key(name); + int ret = -ENOENT; - if (!user) - return -ENOENT; + /* Attempt to delete all event(s) with the name passed in */ + hash_for_each_possible_safe(group->register_table, user, tmp, node, key) { + if (strcmp(EVENT_NAME(user), name)) + continue; - user_event_put(user, true); + if (!user_event_last_ref(user)) + return -EBUSY; - if (!user_event_last_ref(user)) - return -EBUSY; + if (!user_event_capable(user->reg_flags)) + return -EPERM; - if (!user_event_capable(user->reg_flags)) - return -EPERM; + ret = destroy_user_event(user); - return destroy_user_event(user); + if (ret) + goto out; + } +out: + return ret; } /* @@ -2628,7 +2717,7 @@ static int user_seq_show(struct seq_file *m, void *p) hash_for_each(group->register_table, i, user, node) { status = user->status; - seq_printf(m, "%s", EVENT_NAME(user)); + seq_printf(m, "%s", EVENT_TP_NAME(user)); if (status != 0) seq_puts(m, " #"); diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index 7d2ddbcfa377..4f4280815522 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -4,6 +4,7 @@ * Copyright (C) 2022 Google LLC. */ #define pr_fmt(fmt) "trace_fprobe: " fmt +#include <asm/ptrace.h> #include <linux/fprobe.h> #include <linux/module.h> @@ -129,8 +130,8 @@ static bool trace_fprobe_is_registered(struct trace_fprobe *tf) * from user space. */ static int -process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, - void *base) +process_fetch_insn(struct fetch_insn *code, void *rec, void *edata, + void *dest, void *base) { struct pt_regs *regs = rec; unsigned long val; @@ -152,6 +153,9 @@ retry: case FETCH_OP_ARG: val = regs_get_kernel_argument(regs, code->param); break; + case FETCH_OP_EDATA: + val = *(unsigned long *)((unsigned long)edata + code->offset); + break; #endif case FETCH_NOP_SYMBOL: /* Ignore a place holder */ code++; @@ -184,7 +188,7 @@ __fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, if (trace_trigger_soft_disabled(trace_file)) return; - dsize = __get_data_size(&tf->tp, regs); + dsize = __get_data_size(&tf->tp, regs, NULL); entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry) + tf->tp.size + dsize); @@ -194,7 +198,7 @@ __fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, fbuffer.regs = regs; entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); entry->ip = entry_ip; - store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize); + store_trace_args(&entry[1], &tf->tp, regs, NULL, sizeof(*entry), dsize); trace_event_buffer_commit(&fbuffer); } @@ -210,11 +214,24 @@ fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, } NOKPROBE_SYMBOL(fentry_trace_func); -/* Kretprobe handler */ +/* function exit handler */ +static int trace_fprobe_entry_handler(struct fprobe *fp, unsigned long entry_ip, + unsigned long ret_ip, struct pt_regs *regs, + void *entry_data) +{ + struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp); + + if (tf->tp.entry_arg) + store_trace_entry_data(entry_data, &tf->tp, regs); + + return 0; +} +NOKPROBE_SYMBOL(trace_fprobe_entry_handler) + static nokprobe_inline void __fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, unsigned long ret_ip, struct pt_regs *regs, - struct trace_event_file *trace_file) + void *entry_data, struct trace_event_file *trace_file) { struct fexit_trace_entry_head *entry; struct trace_event_buffer fbuffer; @@ -227,7 +244,7 @@ __fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, if (trace_trigger_soft_disabled(trace_file)) return; - dsize = __get_data_size(&tf->tp, regs); + dsize = __get_data_size(&tf->tp, regs, entry_data); entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry) + tf->tp.size + dsize); @@ -238,19 +255,19 @@ __fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); entry->func = entry_ip; entry->ret_ip = ret_ip; - store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize); + store_trace_args(&entry[1], &tf->tp, regs, entry_data, sizeof(*entry), dsize); trace_event_buffer_commit(&fbuffer); } static void fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs) + unsigned long ret_ip, struct pt_regs *regs, void *entry_data) { struct event_file_link *link; trace_probe_for_each_link_rcu(link, &tf->tp) - __fexit_trace_func(tf, entry_ip, ret_ip, regs, link->file); + __fexit_trace_func(tf, entry_ip, ret_ip, regs, entry_data, link->file); } NOKPROBE_SYMBOL(fexit_trace_func); @@ -269,7 +286,7 @@ static int fentry_perf_func(struct trace_fprobe *tf, unsigned long entry_ip, if (hlist_empty(head)) return 0; - dsize = __get_data_size(&tf->tp, regs); + dsize = __get_data_size(&tf->tp, regs, NULL); __size = sizeof(*entry) + tf->tp.size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); @@ -280,7 +297,7 @@ static int fentry_perf_func(struct trace_fprobe *tf, unsigned long entry_ip, entry->ip = entry_ip; memset(&entry[1], 0, dsize); - store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize); + store_trace_args(&entry[1], &tf->tp, regs, NULL, sizeof(*entry), dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); return 0; @@ -289,7 +306,8 @@ NOKPROBE_SYMBOL(fentry_perf_func); static void fexit_perf_func(struct trace_fprobe *tf, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs) + unsigned long ret_ip, struct pt_regs *regs, + void *entry_data) { struct trace_event_call *call = trace_probe_event_call(&tf->tp); struct fexit_trace_entry_head *entry; @@ -301,7 +319,7 @@ fexit_perf_func(struct trace_fprobe *tf, unsigned long entry_ip, if (hlist_empty(head)) return; - dsize = __get_data_size(&tf->tp, regs); + dsize = __get_data_size(&tf->tp, regs, entry_data); __size = sizeof(*entry) + tf->tp.size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); @@ -312,7 +330,7 @@ fexit_perf_func(struct trace_fprobe *tf, unsigned long entry_ip, entry->func = entry_ip; entry->ret_ip = ret_ip; - store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize); + store_trace_args(&entry[1], &tf->tp, regs, entry_data, sizeof(*entry), dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); } @@ -343,10 +361,10 @@ static void fexit_dispatcher(struct fprobe *fp, unsigned long entry_ip, struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp); if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE)) - fexit_trace_func(tf, entry_ip, ret_ip, regs); + fexit_trace_func(tf, entry_ip, ret_ip, regs, entry_data); #ifdef CONFIG_PERF_EVENTS if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE)) - fexit_perf_func(tf, entry_ip, ret_ip, regs); + fexit_perf_func(tf, entry_ip, ret_ip, regs, entry_data); #endif } NOKPROBE_SYMBOL(fexit_dispatcher); @@ -389,7 +407,7 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group, tf->tpoint = tpoint; tf->fp.nr_maxactive = maxactive; - ret = trace_probe_init(&tf->tp, event, group, false); + ret = trace_probe_init(&tf->tp, event, group, false, nargs); if (ret < 0) goto error; @@ -1109,6 +1127,11 @@ static int __trace_fprobe_create(int argc, const char *argv[]) goto error; /* This can be -ENOMEM */ } + if (is_return && tf->tp.entry_arg) { + tf->fp.entry_handler = trace_fprobe_entry_handler; + tf->fp.entry_data_size = traceprobe_get_entry_data_size(&tf->tp); + } + ret = traceprobe_set_print_fmt(&tf->tp, is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL); if (ret < 0) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c4c6e0e0068b..14099cc17fc9 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -290,7 +290,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, INIT_HLIST_NODE(&tk->rp.kp.hlist); INIT_LIST_HEAD(&tk->rp.kp.list); - ret = trace_probe_init(&tk->tp, event, group, false); + ret = trace_probe_init(&tk->tp, event, group, false, nargs); if (ret < 0) goto error; @@ -740,6 +740,9 @@ static unsigned int number_of_same_symbols(char *func_name) return ctx.count; } +static int trace_kprobe_entry_handler(struct kretprobe_instance *ri, + struct pt_regs *regs); + static int __trace_kprobe_create(int argc, const char *argv[]) { /* @@ -948,6 +951,11 @@ static int __trace_kprobe_create(int argc, const char *argv[]) if (ret) goto error; /* This can be -ENOMEM */ } + /* entry handler for kretprobe */ + if (is_return && tk->tp.entry_arg) { + tk->rp.entry_handler = trace_kprobe_entry_handler; + tk->rp.data_size = traceprobe_get_entry_data_size(&tk->tp); + } ptype = is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; ret = traceprobe_set_print_fmt(&tk->tp, ptype); @@ -1303,8 +1311,8 @@ static const struct file_operations kprobe_profile_ops = { /* Note that we don't verify it, since the code does not come from user space */ static int -process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, - void *base) +process_fetch_insn(struct fetch_insn *code, void *rec, void *edata, + void *dest, void *base) { struct pt_regs *regs = rec; unsigned long val; @@ -1329,6 +1337,9 @@ retry: case FETCH_OP_ARG: val = regs_get_kernel_argument(regs, code->param); break; + case FETCH_OP_EDATA: + val = *(unsigned long *)((unsigned long)edata + code->offset); + break; #endif case FETCH_NOP_SYMBOL: /* Ignore a place holder */ code++; @@ -1359,7 +1370,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, if (trace_trigger_soft_disabled(trace_file)) return; - dsize = __get_data_size(&tk->tp, regs); + dsize = __get_data_size(&tk->tp, regs, NULL); entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry) + tk->tp.size + dsize); @@ -1368,7 +1379,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, fbuffer.regs = regs; entry->ip = (unsigned long)tk->rp.kp.addr; - store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); + store_trace_args(&entry[1], &tk->tp, regs, NULL, sizeof(*entry), dsize); trace_event_buffer_commit(&fbuffer); } @@ -1384,6 +1395,31 @@ kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs) NOKPROBE_SYMBOL(kprobe_trace_func); /* Kretprobe handler */ + +static int trace_kprobe_entry_handler(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + struct kretprobe *rp = get_kretprobe(ri); + struct trace_kprobe *tk; + + /* + * There is a small chance that get_kretprobe(ri) returns NULL when + * the kretprobe is unregister on another CPU between kretprobe's + * trampoline_handler and this function. + */ + if (unlikely(!rp)) + return -ENOENT; + + tk = container_of(rp, struct trace_kprobe, rp); + + /* store argument values into ri->data as entry data */ + if (tk->tp.entry_arg) + store_trace_entry_data(ri->data, &tk->tp, regs); + + return 0; +} + + static nokprobe_inline void __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs, @@ -1399,7 +1435,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, if (trace_trigger_soft_disabled(trace_file)) return; - dsize = __get_data_size(&tk->tp, regs); + dsize = __get_data_size(&tk->tp, regs, ri->data); entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry) + tk->tp.size + dsize); @@ -1409,7 +1445,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, fbuffer.regs = regs; entry->func = (unsigned long)tk->rp.kp.addr; entry->ret_ip = get_kretprobe_retaddr(ri); - store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); + store_trace_args(&entry[1], &tk->tp, regs, ri->data, sizeof(*entry), dsize); trace_event_buffer_commit(&fbuffer); } @@ -1557,7 +1593,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) if (hlist_empty(head)) return 0; - dsize = __get_data_size(&tk->tp, regs); + dsize = __get_data_size(&tk->tp, regs, NULL); __size = sizeof(*entry) + tk->tp.size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); @@ -1568,7 +1604,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) entry->ip = (unsigned long)tk->rp.kp.addr; memset(&entry[1], 0, dsize); - store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); + store_trace_args(&entry[1], &tk->tp, regs, NULL, sizeof(*entry), dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); return 0; @@ -1593,7 +1629,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, if (hlist_empty(head)) return; - dsize = __get_data_size(&tk->tp, regs); + dsize = __get_data_size(&tk->tp, regs, ri->data); __size = sizeof(*entry) + tk->tp.size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); @@ -1604,7 +1640,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, entry->func = (unsigned long)tk->rp.kp.addr; entry->ret_ip = get_kretprobe_retaddr(ri); - store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); + store_trace_args(&entry[1], &tk->tp, regs, ri->data, sizeof(*entry), dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 34289f9c6707..217169de0920 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -594,6 +594,8 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type, return 0; } +static int __store_entry_arg(struct trace_probe *tp, int argnum); + static int parse_btf_arg(char *varname, struct fetch_insn **pcode, struct fetch_insn *end, struct traceprobe_parse_context *ctx) @@ -618,11 +620,7 @@ static int parse_btf_arg(char *varname, return -EOPNOTSUPP; } - if (ctx->flags & TPARG_FL_RETURN) { - if (strcmp(varname, "$retval") != 0) { - trace_probe_log_err(ctx->offset, NO_BTFARG); - return -ENOENT; - } + if (ctx->flags & TPARG_FL_RETURN && !strcmp(varname, "$retval")) { code->op = FETCH_OP_RETVAL; /* Check whether the function return type is not void */ if (query_btf_context(ctx) == 0) { @@ -654,11 +652,21 @@ static int parse_btf_arg(char *varname, const char *name = btf_name_by_offset(ctx->btf, params[i].name_off); if (name && !strcmp(name, varname)) { - code->op = FETCH_OP_ARG; - if (ctx->flags & TPARG_FL_TPOINT) - code->param = i + 1; - else - code->param = i; + if (tparg_is_function_entry(ctx->flags)) { + code->op = FETCH_OP_ARG; + if (ctx->flags & TPARG_FL_TPOINT) + code->param = i + 1; + else + code->param = i; + } else if (tparg_is_function_return(ctx->flags)) { + code->op = FETCH_OP_EDATA; + ret = __store_entry_arg(ctx->tp, i); + if (ret < 0) { + /* internal error */ + return ret; + } + code->offset = ret; + } tid = params[i].type; goto found; } @@ -755,6 +763,110 @@ static int check_prepare_btf_string_fetch(char *typename, #endif +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + +static int __store_entry_arg(struct trace_probe *tp, int argnum) +{ + struct probe_entry_arg *earg = tp->entry_arg; + bool match = false; + int i, offset; + + if (!earg) { + earg = kzalloc(sizeof(*tp->entry_arg), GFP_KERNEL); + if (!earg) + return -ENOMEM; + earg->size = 2 * tp->nr_args + 1; + earg->code = kcalloc(earg->size, sizeof(struct fetch_insn), + GFP_KERNEL); + if (!earg->code) { + kfree(earg); + return -ENOMEM; + } + /* Fill the code buffer with 'end' to simplify it */ + for (i = 0; i < earg->size; i++) + earg->code[i].op = FETCH_OP_END; + tp->entry_arg = earg; + } + + offset = 0; + for (i = 0; i < earg->size - 1; i++) { + switch (earg->code[i].op) { + case FETCH_OP_END: + earg->code[i].op = FETCH_OP_ARG; + earg->code[i].param = argnum; + earg->code[i + 1].op = FETCH_OP_ST_EDATA; + earg->code[i + 1].offset = offset; + return offset; + case FETCH_OP_ARG: + match = (earg->code[i].param == argnum); + break; + case FETCH_OP_ST_EDATA: + offset = earg->code[i].offset; + if (match) + return offset; + offset += sizeof(unsigned long); + break; + default: + break; + } + } + return -ENOSPC; +} + +int traceprobe_get_entry_data_size(struct trace_probe *tp) +{ + struct probe_entry_arg *earg = tp->entry_arg; + int i, size = 0; + + if (!earg) + return 0; + + for (i = 0; i < earg->size; i++) { + switch (earg->code[i].op) { + case FETCH_OP_END: + goto out; + case FETCH_OP_ST_EDATA: + size = earg->code[i].offset + sizeof(unsigned long); + break; + default: + break; + } + } +out: + return size; +} + +void store_trace_entry_data(void *edata, struct trace_probe *tp, struct pt_regs *regs) +{ + struct probe_entry_arg *earg = tp->entry_arg; + unsigned long val; + int i; + + if (!earg) + return; + + for (i = 0; i < earg->size; i++) { + struct fetch_insn *code = &earg->code[i]; + + switch (code->op) { + case FETCH_OP_ARG: + val = regs_get_kernel_argument(regs, code->param); + break; + case FETCH_OP_ST_EDATA: + *(unsigned long *)((unsigned long)edata + code->offset) = val; + break; + case FETCH_OP_END: + goto end; + default: + break; + } + } +end: + return; +} +NOKPROBE_SYMBOL(store_trace_entry_data) +#endif + #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) /* Parse $vars. @orig_arg points '$', which syncs to @ctx->offset */ @@ -830,7 +942,7 @@ static int parse_probe_vars(char *orig_arg, const struct fetch_type *t, #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API len = str_has_prefix(arg, "arg"); - if (len && tparg_is_function_entry(ctx->flags)) { + if (len) { ret = kstrtoul(arg + len, 10, ¶m); if (ret) goto inval; @@ -839,15 +951,29 @@ static int parse_probe_vars(char *orig_arg, const struct fetch_type *t, err = TP_ERR_BAD_ARG_NUM; goto inval; } + param--; /* argN starts from 1, but internal arg[N] starts from 0 */ - code->op = FETCH_OP_ARG; - code->param = (unsigned int)param - 1; - /* - * The tracepoint probe will probe a stub function, and the - * first parameter of the stub is a dummy and should be ignored. - */ - if (ctx->flags & TPARG_FL_TPOINT) - code->param++; + if (tparg_is_function_entry(ctx->flags)) { + code->op = FETCH_OP_ARG; + code->param = (unsigned int)param; + /* + * The tracepoint probe will probe a stub function, and the + * first parameter of the stub is a dummy and should be ignored. + */ + if (ctx->flags & TPARG_FL_TPOINT) + code->param++; + } else if (tparg_is_function_return(ctx->flags)) { + /* function entry argument access from return probe */ + ret = __store_entry_arg(ctx->tp, param); + if (ret < 0) /* This error should be an internal error */ + return ret; + + code->op = FETCH_OP_EDATA; + code->offset = ret; + } else { + err = TP_ERR_NOFENTRY_ARGS; + goto inval; + } return 0; } #endif @@ -1037,7 +1163,8 @@ parse_probe_arg(char *arg, const struct fetch_type *type, break; default: if (isalpha(arg[0]) || arg[0] == '_') { /* BTF variable */ - if (!tparg_is_function_entry(ctx->flags)) { + if (!tparg_is_function_entry(ctx->flags) && + !tparg_is_function_return(ctx->flags)) { trace_probe_log_err(ctx->offset, NOSUP_BTFARG); return -EINVAL; } @@ -1090,67 +1217,45 @@ static int __parse_bitfield_probe_arg(const char *bf, return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0; } -/* String length checking wrapper */ -static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, - struct probe_arg *parg, - struct traceprobe_parse_context *ctx) +/* Split type part from @arg and return it. */ +static char *parse_probe_arg_type(char *arg, struct probe_arg *parg, + struct traceprobe_parse_context *ctx) { - struct fetch_insn *code, *scode, *tmp = NULL; - char *t, *t2, *t3; - int ret, len; - char *arg; - - arg = kstrdup(argv, GFP_KERNEL); - if (!arg) - return -ENOMEM; - - ret = -EINVAL; - len = strlen(arg); - if (len > MAX_ARGSTR_LEN) { - trace_probe_log_err(ctx->offset, ARG_TOO_LONG); - goto out; - } else if (len == 0) { - trace_probe_log_err(ctx->offset, NO_ARG_BODY); - goto out; - } - - ret = -ENOMEM; - parg->comm = kstrdup(arg, GFP_KERNEL); - if (!parg->comm) - goto out; + char *t = NULL, *t2, *t3; + int offs; - ret = -EINVAL; t = strchr(arg, ':'); if (t) { - *t = '\0'; - t2 = strchr(++t, '['); + *t++ = '\0'; + t2 = strchr(t, '['); if (t2) { *t2++ = '\0'; t3 = strchr(t2, ']'); if (!t3) { - int offs = t2 + strlen(t2) - arg; + offs = t2 + strlen(t2) - arg; trace_probe_log_err(ctx->offset + offs, ARRAY_NO_CLOSE); - goto out; + return ERR_PTR(-EINVAL); } else if (t3[1] != '\0') { trace_probe_log_err(ctx->offset + t3 + 1 - arg, BAD_ARRAY_SUFFIX); - goto out; + return ERR_PTR(-EINVAL); } *t3 = '\0'; if (kstrtouint(t2, 0, &parg->count) || !parg->count) { trace_probe_log_err(ctx->offset + t2 - arg, BAD_ARRAY_NUM); - goto out; + return ERR_PTR(-EINVAL); } if (parg->count > MAX_ARRAY_LEN) { trace_probe_log_err(ctx->offset + t2 - arg, ARRAY_TOO_BIG); - goto out; + return ERR_PTR(-EINVAL); } } } + offs = t ? t - arg : 0; /* * Since $comm and immediate string can not be dereferenced, @@ -1161,74 +1266,52 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, strncmp(arg, "\\\"", 2) == 0)) { /* The type of $comm must be "string", and not an array type. */ if (parg->count || (t && strcmp(t, "string"))) { - trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), - NEED_STRING_TYPE); - goto out; + trace_probe_log_err(ctx->offset + offs, NEED_STRING_TYPE); + return ERR_PTR(-EINVAL); } parg->type = find_fetch_type("string", ctx->flags); } else parg->type = find_fetch_type(t, ctx->flags); + if (!parg->type) { - trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), BAD_TYPE); - goto out; + trace_probe_log_err(ctx->offset + offs, BAD_TYPE); + return ERR_PTR(-EINVAL); } - code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL); - if (!code) - goto out; - code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; - - ctx->last_type = NULL; - ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1], - ctx); - if (ret) - goto fail; - - /* Update storing type if BTF is available */ - if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) && - ctx->last_type) { - if (!t) { - parg->type = find_fetch_type_from_btf_type(ctx); - } else if (strstr(t, "string")) { - ret = check_prepare_btf_string_fetch(t, &code, ctx); - if (ret) - goto fail; - } - } - parg->offset = *size; - *size += parg->type->size * (parg->count ?: 1); + return t; +} - if (parg->count) { - len = strlen(parg->type->fmttype) + 6; - parg->fmt = kmalloc(len, GFP_KERNEL); - if (!parg->fmt) { - ret = -ENOMEM; - goto out; - } - snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype, - parg->count); - } +/* After parsing, adjust the fetch_insn according to the probe_arg */ +static int finalize_fetch_insn(struct fetch_insn *code, + struct probe_arg *parg, + char *type, + int type_offset, + struct traceprobe_parse_context *ctx) +{ + struct fetch_insn *scode; + int ret; - ret = -EINVAL; /* Store operation */ if (parg->type->is_string) { + /* Check bad combination of the type and the last fetch_insn. */ if (!strcmp(parg->type->name, "symstr")) { if (code->op != FETCH_OP_REG && code->op != FETCH_OP_STACK && code->op != FETCH_OP_RETVAL && code->op != FETCH_OP_ARG && code->op != FETCH_OP_DEREF && code->op != FETCH_OP_TP_ARG) { - trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), + trace_probe_log_err(ctx->offset + type_offset, BAD_SYMSTRING); - goto fail; + return -EINVAL; } } else { if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF && code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM && code->op != FETCH_OP_DATA && code->op != FETCH_OP_TP_ARG) { - trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), + trace_probe_log_err(ctx->offset + type_offset, BAD_STRING); - goto fail; + return -EINVAL; } } + if (!strcmp(parg->type->name, "symstr") || (code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM || code->op == FETCH_OP_DATA) || code->op == FETCH_OP_TP_ARG || @@ -1244,9 +1327,10 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, code++; if (code->op != FETCH_OP_NOP) { trace_probe_log_err(ctx->offset, TOO_MANY_OPS); - goto fail; + return -EINVAL; } } + /* If op == DEREF, replace it with STRING */ if (!strcmp(parg->type->name, "ustring") || code->op == FETCH_OP_UDEREF) @@ -1267,47 +1351,134 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, code++; if (code->op != FETCH_OP_NOP) { trace_probe_log_err(ctx->offset, TOO_MANY_OPS); - goto fail; + return -E2BIG; } code->op = FETCH_OP_ST_RAW; code->size = parg->type->size; } + + /* Save storing fetch_insn. */ scode = code; + /* Modify operation */ - if (t != NULL) { - ret = __parse_bitfield_probe_arg(t, parg->type, &code); + if (type != NULL) { + /* Bitfield needs a special fetch_insn. */ + ret = __parse_bitfield_probe_arg(type, parg->type, &code); if (ret) { - trace_probe_log_err(ctx->offset + t - arg, BAD_BITFIELD); - goto fail; + trace_probe_log_err(ctx->offset + type_offset, BAD_BITFIELD); + return ret; } } else if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) && ctx->last_type) { + /* If user not specified the type, try parsing BTF bitfield. */ ret = parse_btf_bitfield(&code, ctx); if (ret) - goto fail; + return ret; } - ret = -EINVAL; + /* Loop(Array) operation */ if (parg->count) { if (scode->op != FETCH_OP_ST_MEM && scode->op != FETCH_OP_ST_STRING && scode->op != FETCH_OP_ST_USTRING) { - trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), - BAD_STRING); - goto fail; + trace_probe_log_err(ctx->offset + type_offset, BAD_STRING); + return -EINVAL; } code++; if (code->op != FETCH_OP_NOP) { trace_probe_log_err(ctx->offset, TOO_MANY_OPS); - goto fail; + return -E2BIG; } code->op = FETCH_OP_LP_ARRAY; code->param = parg->count; } + + /* Finalize the fetch_insn array. */ code++; code->op = FETCH_OP_END; - ret = 0; + return 0; +} + +/* String length checking wrapper */ +static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, + struct probe_arg *parg, + struct traceprobe_parse_context *ctx) +{ + struct fetch_insn *code, *tmp = NULL; + char *type, *arg; + int ret, len; + + len = strlen(argv); + if (len > MAX_ARGSTR_LEN) { + trace_probe_log_err(ctx->offset, ARG_TOO_LONG); + return -E2BIG; + } else if (len == 0) { + trace_probe_log_err(ctx->offset, NO_ARG_BODY); + return -EINVAL; + } + + arg = kstrdup(argv, GFP_KERNEL); + if (!arg) + return -ENOMEM; + + parg->comm = kstrdup(arg, GFP_KERNEL); + if (!parg->comm) { + ret = -ENOMEM; + goto out; + } + + type = parse_probe_arg_type(arg, parg, ctx); + if (IS_ERR(type)) { + ret = PTR_ERR(type); + goto out; + } + + code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL); + if (!code) { + ret = -ENOMEM; + goto out; + } + code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; + + ctx->last_type = NULL; + ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1], + ctx); + if (ret < 0) + goto fail; + + /* Update storing type if BTF is available */ + if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) && + ctx->last_type) { + if (!type) { + parg->type = find_fetch_type_from_btf_type(ctx); + } else if (strstr(type, "string")) { + ret = check_prepare_btf_string_fetch(type, &code, ctx); + if (ret) + goto fail; + } + } + parg->offset = *size; + *size += parg->type->size * (parg->count ?: 1); + + if (parg->count) { + len = strlen(parg->type->fmttype) + 6; + parg->fmt = kmalloc(len, GFP_KERNEL); + if (!parg->fmt) { + ret = -ENOMEM; + goto out; + } + snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype, + parg->count); + } + + ret = finalize_fetch_insn(code, parg, type, type ? type - arg : 0, ctx); + if (ret < 0) + goto fail; + + for (; code < tmp + FETCH_INSN_MAX; code++) + if (code->op == FETCH_OP_END) + break; /* Shrink down the code buffer */ parg->code = kcalloc(code - tmp + 1, sizeof(*code), GFP_KERNEL); if (!parg->code) @@ -1316,7 +1487,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, memcpy(parg->code, tmp, sizeof(*code) * (code - tmp + 1)); fail: - if (ret) { + if (ret < 0) { for (code = tmp; code < tmp + FETCH_INSN_MAX; code++) if (code->op == FETCH_NOP_SYMBOL || code->op == FETCH_OP_DATA) @@ -1379,9 +1550,7 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg, struct probe_arg *parg = &tp->args[i]; const char *body; - /* Increment count for freeing args in error case */ - tp->nr_args++; - + ctx->tp = tp; body = strchr(arg, '='); if (body) { if (body - arg > MAX_ARG_NAME_LEN) { @@ -1438,7 +1607,8 @@ static int argv_has_var_arg(int argc, const char *argv[], int *args_idx, if (str_has_prefix(argv[i], "$arg")) { trace_probe_log_set_index(i + 2); - if (!tparg_is_function_entry(ctx->flags)) { + if (!tparg_is_function_entry(ctx->flags) && + !tparg_is_function_return(ctx->flags)) { trace_probe_log_err(0, NOFENTRY_ARGS); return -EINVAL; } @@ -1761,12 +1931,18 @@ void trace_probe_cleanup(struct trace_probe *tp) for (i = 0; i < tp->nr_args; i++) traceprobe_free_probe_arg(&tp->args[i]); + if (tp->entry_arg) { + kfree(tp->entry_arg->code); + kfree(tp->entry_arg); + tp->entry_arg = NULL; + } + if (tp->event) trace_probe_unlink(tp); } int trace_probe_init(struct trace_probe *tp, const char *event, - const char *group, bool alloc_filter) + const char *group, bool alloc_filter, int nargs) { struct trace_event_call *call; size_t size = sizeof(struct trace_probe_event); @@ -1802,6 +1978,11 @@ int trace_probe_init(struct trace_probe *tp, const char *event, goto error; } + tp->nr_args = nargs; + /* Make sure pointers in args[] are NULL */ + if (nargs) + memset(tp->args, 0, sizeof(tp->args[0]) * nargs); + return 0; error: diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index c1877d018269..cef3a50628a3 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -92,6 +92,7 @@ enum fetch_op { FETCH_OP_ARG, /* Function argument : .param */ FETCH_OP_FOFFS, /* File offset: .immediate */ FETCH_OP_DATA, /* Allocated data: .data */ + FETCH_OP_EDATA, /* Entry data: .offset */ // Stage 2 (dereference) op FETCH_OP_DEREF, /* Dereference: .offset */ FETCH_OP_UDEREF, /* User-space Dereference: .offset */ @@ -102,6 +103,7 @@ enum fetch_op { FETCH_OP_ST_STRING, /* String: .offset, .size */ FETCH_OP_ST_USTRING, /* User String: .offset, .size */ FETCH_OP_ST_SYMSTR, /* Kernel Symbol String: .offset, .size */ + FETCH_OP_ST_EDATA, /* Store Entry Data: .offset */ // Stage 4 (modify) op FETCH_OP_MOD_BF, /* Bitfield: .basesize, .lshift, .rshift */ // Stage 5 (loop) op @@ -232,6 +234,11 @@ struct probe_arg { const struct fetch_type *type; /* Type of this argument */ }; +struct probe_entry_arg { + struct fetch_insn *code; + unsigned int size; /* The entry data size */ +}; + struct trace_uprobe_filter { rwlock_t rwlock; int nr_systemwide; @@ -253,6 +260,7 @@ struct trace_probe { struct trace_probe_event *event; ssize_t size; /* trace entry size */ unsigned int nr_args; + struct probe_entry_arg *entry_arg; /* This is only for return probe */ struct probe_arg args[]; }; @@ -338,7 +346,7 @@ static inline bool trace_probe_has_single_file(struct trace_probe *tp) } int trace_probe_init(struct trace_probe *tp, const char *event, - const char *group, bool alloc_filter); + const char *group, bool alloc_filter, int nargs); void trace_probe_cleanup(struct trace_probe *tp); int trace_probe_append(struct trace_probe *tp, struct trace_probe *to); void trace_probe_unlink(struct trace_probe *tp); @@ -355,6 +363,18 @@ int trace_probe_create(const char *raw_command, int (*createfn)(int, const char int trace_probe_print_args(struct trace_seq *s, struct probe_arg *args, int nr_args, u8 *data, void *field); +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API +int traceprobe_get_entry_data_size(struct trace_probe *tp); +/* This is a runtime function to store entry data */ +void store_trace_entry_data(void *edata, struct trace_probe *tp, struct pt_regs *regs); +#else /* !CONFIG_HAVE_FUNCTION_ARG_ACCESS_API */ +static inline int traceprobe_get_entry_data_size(struct trace_probe *tp) +{ + return 0; +} +#define store_trace_entry_data(edata, tp, regs) do { } while (0) +#endif + #define trace_probe_for_each_link(pos, tp) \ list_for_each_entry(pos, &(tp)->event->files, list) #define trace_probe_for_each_link_rcu(pos, tp) \ @@ -381,6 +401,11 @@ static inline bool tparg_is_function_entry(unsigned int flags) return (flags & TPARG_FL_LOC_MASK) == (TPARG_FL_KERNEL | TPARG_FL_FENTRY); } +static inline bool tparg_is_function_return(unsigned int flags) +{ + return (flags & TPARG_FL_LOC_MASK) == (TPARG_FL_KERNEL | TPARG_FL_RETURN); +} + struct traceprobe_parse_context { struct trace_event_call *event; /* BTF related parameters */ @@ -392,6 +417,7 @@ struct traceprobe_parse_context { const struct btf_type *last_type; /* Saved type */ u32 last_bitoffs; /* Saved bitoffs */ u32 last_bitsize; /* Saved bitsize */ + struct trace_probe *tp; unsigned int flags; int offset; }; @@ -506,7 +532,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(NO_BTFARG, "This variable is not found at this probe point"),\ C(NO_BTF_ENTRY, "No BTF entry for this probe point"), \ C(BAD_VAR_ARGS, "$arg* must be an independent parameter without name etc."),\ - C(NOFENTRY_ARGS, "$arg* can be used only on function entry"), \ + C(NOFENTRY_ARGS, "$arg* can be used only on function entry or exit"), \ C(DOUBLE_ARGS, "$arg* can be used only once in the parameters"), \ C(ARGS_2LONG, "$arg* failed because the argument list is too long"), \ C(ARGIDX_2BIG, "$argN index is too big"), \ diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 3935b347f874..2caf0d2afb32 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -54,7 +54,7 @@ fetch_apply_bitfield(struct fetch_insn *code, void *buf) * If dest is NULL, don't store result and return required dynamic data size. */ static int -process_fetch_insn(struct fetch_insn *code, void *rec, +process_fetch_insn(struct fetch_insn *code, void *rec, void *edata, void *dest, void *base); static nokprobe_inline int fetch_store_strlen(unsigned long addr); static nokprobe_inline int @@ -232,7 +232,7 @@ array: /* Sum up total data length for dynamic arrays (strings) */ static nokprobe_inline int -__get_data_size(struct trace_probe *tp, struct pt_regs *regs) +__get_data_size(struct trace_probe *tp, struct pt_regs *regs, void *edata) { struct probe_arg *arg; int i, len, ret = 0; @@ -240,7 +240,7 @@ __get_data_size(struct trace_probe *tp, struct pt_regs *regs) for (i = 0; i < tp->nr_args; i++) { arg = tp->args + i; if (unlikely(arg->dynamic)) { - len = process_fetch_insn(arg->code, regs, NULL, NULL); + len = process_fetch_insn(arg->code, regs, edata, NULL, NULL); if (len > 0) ret += len; } @@ -251,7 +251,7 @@ __get_data_size(struct trace_probe *tp, struct pt_regs *regs) /* Store the value of each argument */ static nokprobe_inline void -store_trace_args(void *data, struct trace_probe *tp, void *rec, +store_trace_args(void *data, struct trace_probe *tp, void *rec, void *edata, int header_size, int maxlen) { struct probe_arg *arg; @@ -266,7 +266,7 @@ store_trace_args(void *data, struct trace_probe *tp, void *rec, /* Point the dynamic data area if needed */ if (unlikely(arg->dynamic)) *dl = make_data_loc(maxlen, dyndata - base); - ret = process_fetch_insn(arg->code, rec, dl, base); + ret = process_fetch_insn(arg->code, rec, edata, dl, base); if (arg->dynamic && likely(ret > 0)) { dyndata += ret; maxlen -= ret; diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index c9ffdcfe622e..8a407adb0e1c 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -8,6 +8,7 @@ #include <linux/module.h> #include <linux/kallsyms.h> #include <linux/uaccess.h> +#include <linux/kmemleak.h> #include <linux/ftrace.h> #include <trace/events/sched.h> @@ -148,3 +149,517 @@ void tracing_stop_tgid_record(void) { tracing_stop_sched_switch(RECORD_TGID); } + +/* + * The tgid_map array maps from pid to tgid; i.e. the value stored at index i + * is the tgid last observed corresponding to pid=i. + */ +static int *tgid_map; + +/* The maximum valid index into tgid_map. */ +static size_t tgid_map_max; + +#define SAVED_CMDLINES_DEFAULT 128 +#define NO_CMDLINE_MAP UINT_MAX +/* + * Preemption must be disabled before acquiring trace_cmdline_lock. + * The various trace_arrays' max_lock must be acquired in a context + * where interrupt is disabled. + */ +static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; +struct saved_cmdlines_buffer { + unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; + unsigned *map_cmdline_to_pid; + unsigned cmdline_num; + int cmdline_idx; + char saved_cmdlines[]; +}; +static struct saved_cmdlines_buffer *savedcmd; + +/* Holds the size of a cmdline and pid element */ +#define SAVED_CMDLINE_MAP_ELEMENT_SIZE(s) \ + (TASK_COMM_LEN + sizeof((s)->map_cmdline_to_pid[0])) + +static inline char *get_saved_cmdlines(int idx) +{ + return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN]; +} + +static inline void set_cmdline(int idx, const char *cmdline) +{ + strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); +} + +static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) +{ + int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN); + + kmemleak_free(s); + free_pages((unsigned long)s, order); +} + +static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val) +{ + struct saved_cmdlines_buffer *s; + struct page *page; + int orig_size, size; + int order; + + /* Figure out how much is needed to hold the given number of cmdlines */ + orig_size = sizeof(*s) + val * SAVED_CMDLINE_MAP_ELEMENT_SIZE(s); + order = get_order(orig_size); + size = 1 << (order + PAGE_SHIFT); + page = alloc_pages(GFP_KERNEL, order); + if (!page) + return NULL; + + s = page_address(page); + kmemleak_alloc(s, size, 1, GFP_KERNEL); + memset(s, 0, sizeof(*s)); + + /* Round up to actual allocation */ + val = (size - sizeof(*s)) / SAVED_CMDLINE_MAP_ELEMENT_SIZE(s); + s->cmdline_num = val; + + /* Place map_cmdline_to_pid array right after saved_cmdlines */ + s->map_cmdline_to_pid = (unsigned *)&s->saved_cmdlines[val * TASK_COMM_LEN]; + + s->cmdline_idx = 0; + memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, + sizeof(s->map_pid_to_cmdline)); + memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, + val * sizeof(*s->map_cmdline_to_pid)); + + return s; +} + +int trace_create_savedcmd(void) +{ + savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT); + + return savedcmd ? 0 : -ENOMEM; +} + +int trace_save_cmdline(struct task_struct *tsk) +{ + unsigned tpid, idx; + + /* treat recording of idle task as a success */ + if (!tsk->pid) + return 1; + + tpid = tsk->pid & (PID_MAX_DEFAULT - 1); + + /* + * It's not the end of the world if we don't get + * the lock, but we also don't want to spin + * nor do we want to disable interrupts, + * so if we miss here, then better luck next time. + * + * This is called within the scheduler and wake up, so interrupts + * had better been disabled and run queue lock been held. + */ + lockdep_assert_preemption_disabled(); + if (!arch_spin_trylock(&trace_cmdline_lock)) + return 0; + + idx = savedcmd->map_pid_to_cmdline[tpid]; + if (idx == NO_CMDLINE_MAP) { + idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num; + + savedcmd->map_pid_to_cmdline[tpid] = idx; + savedcmd->cmdline_idx = idx; + } + + savedcmd->map_cmdline_to_pid[idx] = tsk->pid; + set_cmdline(idx, tsk->comm); + + arch_spin_unlock(&trace_cmdline_lock); + + return 1; +} + +static void __trace_find_cmdline(int pid, char comm[]) +{ + unsigned map; + int tpid; + + if (!pid) { + strcpy(comm, "<idle>"); + return; + } + + if (WARN_ON_ONCE(pid < 0)) { + strcpy(comm, "<XXX>"); + return; + } + + tpid = pid & (PID_MAX_DEFAULT - 1); + map = savedcmd->map_pid_to_cmdline[tpid]; + if (map != NO_CMDLINE_MAP) { + tpid = savedcmd->map_cmdline_to_pid[map]; + if (tpid == pid) { + strscpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); + return; + } + } + strcpy(comm, "<...>"); +} + +void trace_find_cmdline(int pid, char comm[]) +{ + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + + __trace_find_cmdline(pid, comm); + + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); +} + +static int *trace_find_tgid_ptr(int pid) +{ + /* + * Pairs with the smp_store_release in set_tracer_flag() to ensure that + * if we observe a non-NULL tgid_map then we also observe the correct + * tgid_map_max. + */ + int *map = smp_load_acquire(&tgid_map); + + if (unlikely(!map || pid > tgid_map_max)) + return NULL; + + return &map[pid]; +} + +int trace_find_tgid(int pid) +{ + int *ptr = trace_find_tgid_ptr(pid); + + return ptr ? *ptr : 0; +} + +static int trace_save_tgid(struct task_struct *tsk) +{ + int *ptr; + + /* treat recording of idle task as a success */ + if (!tsk->pid) + return 1; + + ptr = trace_find_tgid_ptr(tsk->pid); + if (!ptr) + return 0; + + *ptr = tsk->tgid; + return 1; +} + +static bool tracing_record_taskinfo_skip(int flags) +{ + if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID)))) + return true; + if (!__this_cpu_read(trace_taskinfo_save)) + return true; + return false; +} + +/** + * tracing_record_taskinfo - record the task info of a task + * + * @task: task to record + * @flags: TRACE_RECORD_CMDLINE for recording comm + * TRACE_RECORD_TGID for recording tgid + */ +void tracing_record_taskinfo(struct task_struct *task, int flags) +{ + bool done; + + if (tracing_record_taskinfo_skip(flags)) + return; + + /* + * Record as much task information as possible. If some fail, continue + * to try to record the others. + */ + done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task); + + /* If recording any information failed, retry again soon. */ + if (!done) + return; + + __this_cpu_write(trace_taskinfo_save, false); +} + +/** + * tracing_record_taskinfo_sched_switch - record task info for sched_switch + * + * @prev: previous task during sched_switch + * @next: next task during sched_switch + * @flags: TRACE_RECORD_CMDLINE for recording comm + * TRACE_RECORD_TGID for recording tgid + */ +void tracing_record_taskinfo_sched_switch(struct task_struct *prev, + struct task_struct *next, int flags) +{ + bool done; + + if (tracing_record_taskinfo_skip(flags)) + return; + + /* + * Record as much task information as possible. If some fail, continue + * to try to record the others. + */ + done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev); + done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next); + + /* If recording any information failed, retry again soon. */ + if (!done) + return; + + __this_cpu_write(trace_taskinfo_save, false); +} + +/* Helpers to record a specific task information */ +void tracing_record_cmdline(struct task_struct *task) +{ + tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE); +} + +void tracing_record_tgid(struct task_struct *task) +{ + tracing_record_taskinfo(task, TRACE_RECORD_TGID); +} + +int trace_alloc_tgid_map(void) +{ + int *map; + + if (tgid_map) + return 0; + + tgid_map_max = pid_max; + map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map), + GFP_KERNEL); + if (!map) + return -ENOMEM; + + /* + * Pairs with smp_load_acquire() in + * trace_find_tgid_ptr() to ensure that if it observes + * the tgid_map we just allocated then it also observes + * the corresponding tgid_map_max value. + */ + smp_store_release(&tgid_map, map); + return 0; +} + +static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos) +{ + int pid = ++(*pos); + + return trace_find_tgid_ptr(pid); +} + +static void *saved_tgids_start(struct seq_file *m, loff_t *pos) +{ + int pid = *pos; + + return trace_find_tgid_ptr(pid); +} + +static void saved_tgids_stop(struct seq_file *m, void *v) +{ +} + +static int saved_tgids_show(struct seq_file *m, void *v) +{ + int *entry = (int *)v; + int pid = entry - tgid_map; + int tgid = *entry; + + if (tgid == 0) + return SEQ_SKIP; + + seq_printf(m, "%d %d\n", pid, tgid); + return 0; +} + +static const struct seq_operations tracing_saved_tgids_seq_ops = { + .start = saved_tgids_start, + .stop = saved_tgids_stop, + .next = saved_tgids_next, + .show = saved_tgids_show, +}; + +static int tracing_saved_tgids_open(struct inode *inode, struct file *filp) +{ + int ret; + + ret = tracing_check_open_get_tr(NULL); + if (ret) + return ret; + + return seq_open(filp, &tracing_saved_tgids_seq_ops); +} + + +const struct file_operations tracing_saved_tgids_fops = { + .open = tracing_saved_tgids_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) +{ + unsigned int *ptr = v; + + if (*pos || m->count) + ptr++; + + (*pos)++; + + for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num]; + ptr++) { + if (*ptr == -1 || *ptr == NO_CMDLINE_MAP) + continue; + + return ptr; + } + + return NULL; +} + +static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos) +{ + void *v; + loff_t l = 0; + + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + + v = &savedcmd->map_cmdline_to_pid[0]; + while (l <= *pos) { + v = saved_cmdlines_next(m, v, &l); + if (!v) + return NULL; + } + + return v; +} + +static void saved_cmdlines_stop(struct seq_file *m, void *v) +{ + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); +} + +static int saved_cmdlines_show(struct seq_file *m, void *v) +{ + char buf[TASK_COMM_LEN]; + unsigned int *pid = v; + + __trace_find_cmdline(*pid, buf); + seq_printf(m, "%d %s\n", *pid, buf); + return 0; +} + +static const struct seq_operations tracing_saved_cmdlines_seq_ops = { + .start = saved_cmdlines_start, + .next = saved_cmdlines_next, + .stop = saved_cmdlines_stop, + .show = saved_cmdlines_show, +}; + +static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp) +{ + int ret; + + ret = tracing_check_open_get_tr(NULL); + if (ret) + return ret; + + return seq_open(filp, &tracing_saved_cmdlines_seq_ops); +} + +const struct file_operations tracing_saved_cmdlines_fops = { + .open = tracing_saved_cmdlines_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static ssize_t +tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num); + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +void trace_free_saved_cmdlines_buffer(void) +{ + free_saved_cmdlines_buffer(savedcmd); +} + +static int tracing_resize_saved_cmdlines(unsigned int val) +{ + struct saved_cmdlines_buffer *s, *savedcmd_temp; + + s = allocate_cmdlines_buffer(val); + if (!s) + return -ENOMEM; + + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + savedcmd_temp = savedcmd; + savedcmd = s; + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); + free_saved_cmdlines_buffer(savedcmd_temp); + + return 0; +} + +static ssize_t +tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + /* must have at least 1 entry or less than PID_MAX_DEFAULT */ + if (!val || val > PID_MAX_DEFAULT) + return -EINVAL; + + ret = tracing_resize_saved_cmdlines((unsigned int)val); + if (ret < 0) + return ret; + + *ppos += cnt; + + return cnt; +} + +const struct file_operations tracing_saved_cmdlines_size_fops = { + .open = tracing_open_generic, + .read = tracing_saved_cmdlines_size_read, + .write = tracing_saved_cmdlines_size_write, +}; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 529590499b1f..e9c5058a8efd 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -768,7 +768,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) { ftrace_graph_stop(); printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); - if (ftrace_dump_on_oops) { + if (ftrace_dump_on_oops_enabled()) { ftrace_dump(DUMP_ALL); /* ftrace_dump() disables tracing */ tracing_on(); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index a84b85d8aac1..9e461362450a 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -211,8 +211,8 @@ static unsigned long translate_user_vaddr(unsigned long file_offset) /* Note that we don't verify it, since the code does not come from user space */ static int -process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, - void *base) +process_fetch_insn(struct fetch_insn *code, void *rec, void *edata, + void *dest, void *base) { struct pt_regs *regs = rec; unsigned long val; @@ -337,7 +337,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) if (!tu) return ERR_PTR(-ENOMEM); - ret = trace_probe_init(&tu->tp, event, group, true); + ret = trace_probe_init(&tu->tp, event, group, true, nargs); if (ret < 0) goto error; @@ -1490,11 +1490,11 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) if (WARN_ON_ONCE(!uprobe_cpu_buffer)) return 0; - dsize = __get_data_size(&tu->tp, regs); + dsize = __get_data_size(&tu->tp, regs, NULL); esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); ucb = uprobe_buffer_get(); - store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize); + store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize); if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) ret |= uprobe_trace_func(tu, regs, ucb, dsize); @@ -1525,11 +1525,11 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, if (WARN_ON_ONCE(!uprobe_cpu_buffer)) return 0; - dsize = __get_data_size(&tu->tp, regs); + dsize = __get_data_size(&tu->tp, regs, NULL); esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); ucb = uprobe_buffer_get(); - store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize); + store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize); if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) uretprobe_trace_func(tu, func, regs, ucb, dsize); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index ce4d99df5f0e..0b0b95418b16 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -931,7 +931,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, struct uid_gid_map new_map; unsigned idx; struct uid_gid_extent extent; - char *kbuf = NULL, *pos, *next_line; + char *kbuf, *pos, *next_line; ssize_t ret; /* Only allow < page size writes at the beginning of the file */ diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c new file mode 100644 index 000000000000..f95516cd45bb --- /dev/null +++ b/kernel/vmcore_info.c @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * crash.c - kernel crash support code. + * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> + */ + +#include <linux/buildid.h> +#include <linux/init.h> +#include <linux/utsname.h> +#include <linux/vmalloc.h> +#include <linux/sizes.h> +#include <linux/kexec.h> +#include <linux/memory.h> +#include <linux/cpuhotplug.h> +#include <linux/memblock.h> +#include <linux/kmemleak.h> + +#include <asm/page.h> +#include <asm/sections.h> + +#include <crypto/sha1.h> + +#include "kallsyms_internal.h" +#include "kexec_internal.h" + +/* vmcoreinfo stuff */ +unsigned char *vmcoreinfo_data; +size_t vmcoreinfo_size; +u32 *vmcoreinfo_note; + +/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ +static unsigned char *vmcoreinfo_data_safecopy; + +Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, + void *data, size_t data_len) +{ + struct elf_note *note = (struct elf_note *)buf; + + note->n_namesz = strlen(name) + 1; + note->n_descsz = data_len; + note->n_type = type; + buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word)); + memcpy(buf, name, note->n_namesz); + buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word)); + memcpy(buf, data, data_len); + buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word)); + + return buf; +} + +void final_note(Elf_Word *buf) +{ + memset(buf, 0, sizeof(struct elf_note)); +} + +static void update_vmcoreinfo_note(void) +{ + u32 *buf = vmcoreinfo_note; + + if (!vmcoreinfo_size) + return; + buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, + vmcoreinfo_size); + final_note(buf); +} + +void crash_update_vmcoreinfo_safecopy(void *ptr) +{ + if (ptr) + memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size); + + vmcoreinfo_data_safecopy = ptr; +} + +void crash_save_vmcoreinfo(void) +{ + if (!vmcoreinfo_note) + return; + + /* Use the safe copy to generate vmcoreinfo note if have */ + if (vmcoreinfo_data_safecopy) + vmcoreinfo_data = vmcoreinfo_data_safecopy; + + vmcoreinfo_append_str("CRASHTIME=%lld\n", ktime_get_real_seconds()); + update_vmcoreinfo_note(); +} + +void vmcoreinfo_append_str(const char *fmt, ...) +{ + va_list args; + char buf[0x50]; + size_t r; + + va_start(args, fmt); + r = vscnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size); + + memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); + + vmcoreinfo_size += r; + + WARN_ONCE(vmcoreinfo_size == VMCOREINFO_BYTES, + "vmcoreinfo data exceeds allocated size, truncating"); +} + +/* + * provide an empty default implementation here -- architecture + * code may override this + */ +void __weak arch_crash_save_vmcoreinfo(void) +{} + +phys_addr_t __weak paddr_vmcoreinfo_note(void) +{ + return __pa(vmcoreinfo_note); +} +EXPORT_SYMBOL(paddr_vmcoreinfo_note); + +static int __init crash_save_vmcoreinfo_init(void) +{ + vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); + if (!vmcoreinfo_data) { + pr_warn("Memory allocation for vmcoreinfo_data failed\n"); + return -ENOMEM; + } + + vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE, + GFP_KERNEL | __GFP_ZERO); + if (!vmcoreinfo_note) { + free_page((unsigned long)vmcoreinfo_data); + vmcoreinfo_data = NULL; + pr_warn("Memory allocation for vmcoreinfo_note failed\n"); + return -ENOMEM; + } + + VMCOREINFO_OSRELEASE(init_uts_ns.name.release); + VMCOREINFO_BUILD_ID(); + VMCOREINFO_PAGESIZE(PAGE_SIZE); + + VMCOREINFO_SYMBOL(init_uts_ns); + VMCOREINFO_OFFSET(uts_namespace, name); + VMCOREINFO_SYMBOL(node_online_map); +#ifdef CONFIG_MMU + VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir); +#endif + VMCOREINFO_SYMBOL(_stext); + vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", (unsigned long) VMALLOC_START); + +#ifndef CONFIG_NUMA + VMCOREINFO_SYMBOL(mem_map); + VMCOREINFO_SYMBOL(contig_page_data); +#endif +#ifdef CONFIG_SPARSEMEM_VMEMMAP + VMCOREINFO_SYMBOL_ARRAY(vmemmap); +#endif +#ifdef CONFIG_SPARSEMEM + VMCOREINFO_SYMBOL_ARRAY(mem_section); + VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); + VMCOREINFO_STRUCT_SIZE(mem_section); + VMCOREINFO_OFFSET(mem_section, section_mem_map); + VMCOREINFO_NUMBER(SECTION_SIZE_BITS); + VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS); +#endif + VMCOREINFO_STRUCT_SIZE(page); + VMCOREINFO_STRUCT_SIZE(pglist_data); + VMCOREINFO_STRUCT_SIZE(zone); + VMCOREINFO_STRUCT_SIZE(free_area); + VMCOREINFO_STRUCT_SIZE(list_head); + VMCOREINFO_SIZE(nodemask_t); + VMCOREINFO_OFFSET(page, flags); + VMCOREINFO_OFFSET(page, _refcount); + VMCOREINFO_OFFSET(page, mapping); + VMCOREINFO_OFFSET(page, lru); + VMCOREINFO_OFFSET(page, _mapcount); + VMCOREINFO_OFFSET(page, private); + VMCOREINFO_OFFSET(page, compound_head); + VMCOREINFO_OFFSET(pglist_data, node_zones); + VMCOREINFO_OFFSET(pglist_data, nr_zones); +#ifdef CONFIG_FLATMEM + VMCOREINFO_OFFSET(pglist_data, node_mem_map); +#endif + VMCOREINFO_OFFSET(pglist_data, node_start_pfn); + VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); + VMCOREINFO_OFFSET(pglist_data, node_id); + VMCOREINFO_OFFSET(zone, free_area); + VMCOREINFO_OFFSET(zone, vm_stat); + VMCOREINFO_OFFSET(zone, spanned_pages); + VMCOREINFO_OFFSET(free_area, free_list); + VMCOREINFO_OFFSET(list_head, next); + VMCOREINFO_OFFSET(list_head, prev); + VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS); + log_buf_vmcoreinfo_setup(); + VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); + VMCOREINFO_NUMBER(NR_FREE_PAGES); + VMCOREINFO_NUMBER(PG_lru); + VMCOREINFO_NUMBER(PG_private); + VMCOREINFO_NUMBER(PG_swapcache); + VMCOREINFO_NUMBER(PG_swapbacked); + VMCOREINFO_NUMBER(PG_slab); +#ifdef CONFIG_MEMORY_FAILURE + VMCOREINFO_NUMBER(PG_hwpoison); +#endif + VMCOREINFO_NUMBER(PG_head_mask); +#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy) + VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); +#ifdef CONFIG_HUGETLB_PAGE + VMCOREINFO_NUMBER(PG_hugetlb); +#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline) + VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); +#endif + +#ifdef CONFIG_KALLSYMS + VMCOREINFO_SYMBOL(kallsyms_names); + VMCOREINFO_SYMBOL(kallsyms_num_syms); + VMCOREINFO_SYMBOL(kallsyms_token_table); + VMCOREINFO_SYMBOL(kallsyms_token_index); +#ifdef CONFIG_KALLSYMS_BASE_RELATIVE + VMCOREINFO_SYMBOL(kallsyms_offsets); + VMCOREINFO_SYMBOL(kallsyms_relative_base); +#else + VMCOREINFO_SYMBOL(kallsyms_addresses); +#endif /* CONFIG_KALLSYMS_BASE_RELATIVE */ +#endif /* CONFIG_KALLSYMS */ + + arch_crash_save_vmcoreinfo(); + update_vmcoreinfo_note(); + + return 0; +} + +subsys_initcall(crash_save_vmcoreinfo_init); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 81a8862295d6..d7b2125503af 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -796,8 +796,8 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, /* * /proc/sys/kernel/watchdog */ -int proc_watchdog(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static int proc_watchdog(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(WATCHDOG_HARDLOCKUP_ENABLED | WATCHDOG_SOFTOCKUP_ENABLED, @@ -807,8 +807,8 @@ int proc_watchdog(struct ctl_table *table, int write, /* * /proc/sys/kernel/nmi_watchdog */ -int proc_nmi_watchdog(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static int proc_nmi_watchdog(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { if (!watchdog_hardlockup_available && write) return -ENOTSUPP; @@ -816,21 +816,23 @@ int proc_nmi_watchdog(struct ctl_table *table, int write, table, write, buffer, lenp, ppos); } +#ifdef CONFIG_SOFTLOCKUP_DETECTOR /* * /proc/sys/kernel/soft_watchdog */ -int proc_soft_watchdog(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static int proc_soft_watchdog(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(WATCHDOG_SOFTOCKUP_ENABLED, table, write, buffer, lenp, ppos); } +#endif /* * /proc/sys/kernel/watchdog_thresh */ -int proc_watchdog_thresh(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static int proc_watchdog_thresh(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int err, old; @@ -852,8 +854,8 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, * user to specify a mask that will include cpus that have not yet * been brought online, if desired. */ -int proc_watchdog_cpumask(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static int proc_watchdog_cpumask(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int err; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8fbb0ec39079..f397510edc9b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -7080,7 +7080,7 @@ static struct device_attribute wq_sysfs_unbound_attrs[] = { __ATTR_NULL, }; -static struct bus_type wq_subsys = { +static const struct bus_type wq_subsys = { .name = "workqueue", .dev_groups = wq_sysfs_groups, }; |
