summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2024-03-25 11:32:29 +0100
committerIngo Molnar <mingo@kernel.org>2024-03-25 11:32:29 +0100
commitf4566a1e73957800df75a3dd2dccee8a4697f327 (patch)
treeb043b875228c0b25988af66c680d60cae69d761d /kernel
parentsched/fair: Fix typos in comments (diff)
parentLinux 6.9-rc1 (diff)
downloadlinux-f4566a1e73957800df75a3dd2dccee8a4697f327.tar.gz
linux-f4566a1e73957800df75a3dd2dccee8a4697f327.zip
Merge tag 'v6.9-rc1' into sched/core, to pick up fixes and to refresh the branch
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.kexec12
-rw-r--r--kernel/Makefile5
-rw-r--r--kernel/audit.c4
-rw-r--r--kernel/auditfilter.c2
-rw-r--r--kernel/bounds.c2
-rw-r--r--kernel/bpf/Kconfig1
-rw-r--r--kernel/bpf/Makefile5
-rw-r--r--kernel/bpf/arena.c558
-rw-r--r--kernel/bpf/arraymap.c2
-rw-r--r--kernel/bpf/bpf_iter.c4
-rw-r--r--kernel/bpf/bpf_local_storage.c52
-rw-r--r--kernel/bpf/bpf_lsm.c21
-rw-r--r--kernel/bpf/bpf_struct_ops.c737
-rw-r--r--kernel/bpf/bpf_struct_ops_types.h12
-rw-r--r--kernel/bpf/btf.c566
-rw-r--r--kernel/bpf/cgroup.c11
-rw-r--r--kernel/bpf/core.c46
-rw-r--r--kernel/bpf/cpumap.c7
-rw-r--r--kernel/bpf/cpumask.c4
-rw-r--r--kernel/bpf/devmap.c11
-rw-r--r--kernel/bpf/disasm.c14
-rw-r--r--kernel/bpf/hashtab.c14
-rw-r--r--kernel/bpf/helpers.c23
-rw-r--r--kernel/bpf/inode.c276
-rw-r--r--kernel/bpf/log.c65
-rw-r--r--kernel/bpf/lpm_trie.c20
-rw-r--r--kernel/bpf/map_iter.c4
-rw-r--r--kernel/bpf/stackmap.c9
-rw-r--r--kernel/bpf/syscall.c298
-rw-r--r--kernel/bpf/token.c278
-rw-r--r--kernel/bpf/trampoline.c4
-rw-r--r--kernel/bpf/verifier.c744
-rw-r--r--kernel/cgroup/rstat.c4
-rw-r--r--kernel/configs/debug.config6
-rw-r--r--kernel/configs/hardening.config7
-rw-r--r--kernel/crash_core.c764
-rw-r--r--kernel/crash_reserve.c464
-rw-r--r--kernel/cred.c4
-rw-r--r--kernel/dma/contiguous.c6
-rw-r--r--kernel/dma/direct.c9
-rw-r--r--kernel/dma/swiotlb.c91
-rw-r--r--kernel/elfcorehdr.c (renamed from kernel/crash_dump.c)0
-rw-r--r--kernel/entry/common.c8
-rw-r--r--kernel/events/core.c8
-rw-r--r--kernel/events/uprobes.c2
-rw-r--r--kernel/hung_task.c1
-rw-r--r--kernel/kallsyms_selftest.c1
-rw-r--r--kernel/kexec.c11
-rw-r--r--kernel/kexec_core.c294
-rw-r--r--kernel/kexec_file.c15
-rw-r--r--kernel/kexec_internal.h2
-rw-r--r--kernel/ksysfs.c12
-rw-r--r--kernel/module/Kconfig3
-rw-r--r--kernel/module/internal.h6
-rw-r--r--kernel/module/main.c29
-rw-r--r--kernel/module/strict_rwx.c63
-rw-r--r--kernel/padata.c14
-rw-r--r--kernel/panic.c17
-rw-r--r--kernel/pid.c6
-rw-r--r--kernel/power/Kconfig26
-rw-r--r--kernel/power/energy_model.c484
-rw-r--r--kernel/power/hibernate.c107
-rw-r--r--kernel/power/main.c182
-rw-r--r--kernel/power/power.h23
-rw-r--r--kernel/power/snapshot.c25
-rw-r--r--kernel/power/suspend.c9
-rw-r--r--kernel/power/suspend_test.c2
-rw-r--r--kernel/power/swap.c197
-rw-r--r--kernel/power/user.c4
-rw-r--r--kernel/printk/nbcon.c41
-rw-r--r--kernel/printk/printk.c137
-rw-r--r--kernel/printk/printk_ringbuffer.c335
-rw-r--r--kernel/printk/printk_ringbuffer.h54
-rw-r--r--kernel/ptrace.c13
-rw-r--r--kernel/sched/core.c16
-rw-r--r--kernel/sched/fair.c6
-rw-r--r--kernel/sched/membarrier.c13
-rw-r--r--kernel/signal.c28
-rw-r--r--kernel/sysctl.c4
-rw-r--r--kernel/time/alarmtimer.c2
-rw-r--r--kernel/time/timer.c12
-rw-r--r--kernel/time/timer_migration.c31
-rw-r--r--kernel/trace/bpf_trace.c27
-rw-r--r--kernel/trace/ftrace.c90
-rw-r--r--kernel/trace/ring_buffer.c177
-rw-r--r--kernel/trace/trace.c816
-rw-r--r--kernel/trace/trace.h18
-rw-r--r--kernel/trace/trace_benchmark.c5
-rw-r--r--kernel/trace/trace_eprobe.c8
-rw-r--r--kernel/trace/trace_events_trigger.c63
-rw-r--r--kernel/trace/trace_events_user.c209
-rw-r--r--kernel/trace/trace_fprobe.c59
-rw-r--r--kernel/trace/trace_kprobe.c58
-rw-r--r--kernel/trace/trace_probe.c417
-rw-r--r--kernel/trace/trace_probe.h30
-rw-r--r--kernel/trace/trace_probe_tmpl.h10
-rw-r--r--kernel/trace/trace_sched_switch.c515
-rw-r--r--kernel/trace/trace_selftest.c2
-rw-r--r--kernel/trace/trace_uprobe.c14
-rw-r--r--kernel/user_namespace.c2
-rw-r--r--kernel/vmcore_info.c233
-rw-r--r--kernel/watchdog.c22
-rw-r--r--kernel/workqueue.c2
103 files changed, 7136 insertions, 3050 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index 946dffa048b7..6c34e63c88ff 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -2,11 +2,13 @@
menu "Kexec and crash features"
-config CRASH_CORE
+config CRASH_RESERVE
+ bool
+
+config VMCORE_INFO
bool
config KEXEC_CORE
- select CRASH_CORE
bool
config KEXEC_ELF
@@ -95,9 +97,11 @@ config KEXEC_JUMP
config CRASH_DUMP
bool "kernel crash dumps"
+ default y
depends on ARCH_SUPPORTS_CRASH_DUMP
- select CRASH_CORE
- select KEXEC_CORE
+ depends on KEXEC_CORE
+ select VMCORE_INFO
+ select CRASH_RESERVE
help
Generate crash dump after being started by kexec.
This should be normally only set in special crash dump kernels
diff --git a/kernel/Makefile b/kernel/Makefile
index ce105a5558fc..3c13240dfc9f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -68,8 +68,10 @@ obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
-obj-$(CONFIG_CRASH_CORE) += crash_core.o
+obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o elfcorehdr.o
+obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o
obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
+obj-$(CONFIG_CRASH_DUMP) += crash_core.o
obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o
@@ -120,7 +122,6 @@ obj-$(CONFIG_PERF_EVENTS) += events/
obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
obj-$(CONFIG_PADATA) += padata.o
-obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o
obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
obj-$(CONFIG_TORTURE_TEST) += torture.o
diff --git a/kernel/audit.c b/kernel/audit.c
index 9c8e5f732c4c..e7a62ebbf4d1 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1693,9 +1693,7 @@ static int __init audit_init(void)
if (audit_initialized == AUDIT_DISABLED)
return 0;
- audit_buffer_cache = kmem_cache_create("audit_buffer",
- sizeof(struct audit_buffer),
- 0, SLAB_PANIC, NULL);
+ audit_buffer_cache = KMEM_CACHE(audit_buffer, SLAB_PANIC);
skb_queue_head_init(&audit_queue);
skb_queue_head_init(&audit_retry_queue);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 8317a37dea0b..be8c680121e4 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -788,7 +788,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
static inline int audit_dupe_lsm_field(struct audit_field *df,
struct audit_field *sf)
{
- int ret = 0;
+ int ret;
char *lsm_str;
/* our own copy of lsm_str */
diff --git a/kernel/bounds.c b/kernel/bounds.c
index b529182e8b04..c5a9fcd2d622 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -19,7 +19,7 @@ int main(void)
DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
#ifdef CONFIG_SMP
- DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
+ DEFINE(NR_CPUS_BITS, bits_per(CONFIG_NR_CPUS));
#endif
DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
#ifdef CONFIG_LRU_GEN
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index 6a906ff93006..bc25f5098a25 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -3,6 +3,7 @@
# BPF interpreter that, for example, classic socket filters depend on.
config BPF
bool
+ select CRYPTO_LIB_SHA1
# Used by archs to tell that they support BPF JIT compiler plus which
# flavour. Only one of the two can be selected for a specific arch since
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index f526b7573e97..368c5d86b5b7 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
endif
CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
@@ -15,6 +15,9 @@ obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o
obj-$(CONFIG_BPF_JIT) += trampoline.o
obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o
+ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy)
+obj-$(CONFIG_BPF_SYSCALL) += arena.o
+endif
obj-$(CONFIG_BPF_JIT) += dispatcher.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
new file mode 100644
index 000000000000..86571e760dd6
--- /dev/null
+++ b/kernel/bpf/arena.c
@@ -0,0 +1,558 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/err.h>
+#include <linux/btf_ids.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+
+/*
+ * bpf_arena is a sparsely populated shared memory region between bpf program and
+ * user space process.
+ *
+ * For example on x86-64 the values could be:
+ * user_vm_start 7f7d26200000 // picked by mmap()
+ * kern_vm_start ffffc90001e69000 // picked by get_vm_area()
+ * For user space all pointers within the arena are normal 8-byte addresses.
+ * In this example 7f7d26200000 is the address of the first page (pgoff=0).
+ * The bpf program will access it as: kern_vm_start + lower_32bit_of_user_ptr
+ * (u32)7f7d26200000 -> 26200000
+ * hence
+ * ffffc90001e69000 + 26200000 == ffffc90028069000 is "pgoff=0" within 4Gb
+ * kernel memory region.
+ *
+ * BPF JITs generate the following code to access arena:
+ * mov eax, eax // eax has lower 32-bit of user pointer
+ * mov word ptr [rax + r12 + off], bx
+ * where r12 == kern_vm_start and off is s16.
+ * Hence allocate 4Gb + GUARD_SZ/2 on each side.
+ *
+ * Initially kernel vm_area and user vma are not populated.
+ * User space can fault-in any address which will insert the page
+ * into kernel and user vma.
+ * bpf program can allocate a page via bpf_arena_alloc_pages() kfunc
+ * which will insert it into kernel vm_area.
+ * The later fault-in from user space will populate that page into user vma.
+ */
+
+/* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */
+#define GUARD_SZ (1ull << sizeof(((struct bpf_insn *)0)->off) * 8)
+#define KERN_VM_SZ ((1ull << 32) + GUARD_SZ)
+
+struct bpf_arena {
+ struct bpf_map map;
+ u64 user_vm_start;
+ u64 user_vm_end;
+ struct vm_struct *kern_vm;
+ struct maple_tree mt;
+ struct list_head vma_list;
+ struct mutex lock;
+};
+
+u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
+{
+ return arena ? (u64) (long) arena->kern_vm->addr + GUARD_SZ / 2 : 0;
+}
+
+u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
+{
+ return arena ? arena->user_vm_start : 0;
+}
+
+static long arena_map_peek_elem(struct bpf_map *map, void *value)
+{
+ return -EOPNOTSUPP;
+}
+
+static long arena_map_push_elem(struct bpf_map *map, void *value, u64 flags)
+{
+ return -EOPNOTSUPP;
+}
+
+static long arena_map_pop_elem(struct bpf_map *map, void *value)
+{
+ return -EOPNOTSUPP;
+}
+
+static long arena_map_delete_elem(struct bpf_map *map, void *value)
+{
+ return -EOPNOTSUPP;
+}
+
+static int arena_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ return -EOPNOTSUPP;
+}
+
+static long compute_pgoff(struct bpf_arena *arena, long uaddr)
+{
+ return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT;
+}
+
+static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
+{
+ struct vm_struct *kern_vm;
+ int numa_node = bpf_map_attr_numa_node(attr);
+ struct bpf_arena *arena;
+ u64 vm_range;
+ int err = -ENOMEM;
+
+ if (attr->key_size || attr->value_size || attr->max_entries == 0 ||
+ /* BPF_F_MMAPABLE must be set */
+ !(attr->map_flags & BPF_F_MMAPABLE) ||
+ /* No unsupported flags present */
+ (attr->map_flags & ~(BPF_F_SEGV_ON_FAULT | BPF_F_MMAPABLE | BPF_F_NO_USER_CONV)))
+ return ERR_PTR(-EINVAL);
+
+ if (attr->map_extra & ~PAGE_MASK)
+ /* If non-zero the map_extra is an expected user VMA start address */
+ return ERR_PTR(-EINVAL);
+
+ vm_range = (u64)attr->max_entries * PAGE_SIZE;
+ if (vm_range > (1ull << 32))
+ return ERR_PTR(-E2BIG);
+
+ if ((attr->map_extra >> 32) != ((attr->map_extra + vm_range - 1) >> 32))
+ /* user vma must not cross 32-bit boundary */
+ return ERR_PTR(-ERANGE);
+
+ kern_vm = get_vm_area(KERN_VM_SZ, VM_SPARSE | VM_USERMAP);
+ if (!kern_vm)
+ return ERR_PTR(-ENOMEM);
+
+ arena = bpf_map_area_alloc(sizeof(*arena), numa_node);
+ if (!arena)
+ goto err;
+
+ arena->kern_vm = kern_vm;
+ arena->user_vm_start = attr->map_extra;
+ if (arena->user_vm_start)
+ arena->user_vm_end = arena->user_vm_start + vm_range;
+
+ INIT_LIST_HEAD(&arena->vma_list);
+ bpf_map_init_from_attr(&arena->map, attr);
+ mt_init_flags(&arena->mt, MT_FLAGS_ALLOC_RANGE);
+ mutex_init(&arena->lock);
+
+ return &arena->map;
+err:
+ free_vm_area(kern_vm);
+ return ERR_PTR(err);
+}
+
+static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
+{
+ struct page *page;
+ pte_t pte;
+
+ pte = ptep_get(ptep);
+ if (!pte_present(pte)) /* sanity check */
+ return 0;
+ page = pte_page(pte);
+ /*
+ * We do not update pte here:
+ * 1. Nobody should be accessing bpf_arena's range outside of a kernel bug
+ * 2. TLB flushing is batched or deferred. Even if we clear pte,
+ * the TLB entries can stick around and continue to permit access to
+ * the freed page. So it all relies on 1.
+ */
+ __free_page(page);
+ return 0;
+}
+
+static void arena_map_free(struct bpf_map *map)
+{
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ /*
+ * Check that user vma-s are not around when bpf map is freed.
+ * mmap() holds vm_file which holds bpf_map refcnt.
+ * munmap() must have happened on vma followed by arena_vm_close()
+ * which would clear arena->vma_list.
+ */
+ if (WARN_ON_ONCE(!list_empty(&arena->vma_list)))
+ return;
+
+ /*
+ * free_vm_area() calls remove_vm_area() that calls free_unmap_vmap_area().
+ * It unmaps everything from vmalloc area and clears pgtables.
+ * Call apply_to_existing_page_range() first to find populated ptes and
+ * free those pages.
+ */
+ apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
+ KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
+ free_vm_area(arena->kern_vm);
+ mtree_destroy(&arena->mt);
+ bpf_map_area_free(arena);
+}
+
+static void *arena_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return ERR_PTR(-EINVAL);
+}
+
+static long arena_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ return -EOPNOTSUPP;
+}
+
+static int arena_map_check_btf(const struct bpf_map *map, const struct btf *btf,
+ const struct btf_type *key_type, const struct btf_type *value_type)
+{
+ return 0;
+}
+
+static u64 arena_map_mem_usage(const struct bpf_map *map)
+{
+ return 0;
+}
+
+struct vma_list {
+ struct vm_area_struct *vma;
+ struct list_head head;
+};
+
+static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma)
+{
+ struct vma_list *vml;
+
+ vml = kmalloc(sizeof(*vml), GFP_KERNEL);
+ if (!vml)
+ return -ENOMEM;
+ vma->vm_private_data = vml;
+ vml->vma = vma;
+ list_add(&vml->head, &arena->vma_list);
+ return 0;
+}
+
+static void arena_vm_close(struct vm_area_struct *vma)
+{
+ struct bpf_map *map = vma->vm_file->private_data;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+ struct vma_list *vml;
+
+ guard(mutex)(&arena->lock);
+ vml = vma->vm_private_data;
+ list_del(&vml->head);
+ vma->vm_private_data = NULL;
+ kfree(vml);
+}
+
+#define MT_ENTRY ((void *)&arena_map_ops) /* unused. has to be valid pointer */
+
+static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
+{
+ struct bpf_map *map = vmf->vma->vm_file->private_data;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+ struct page *page;
+ long kbase, kaddr;
+ int ret;
+
+ kbase = bpf_arena_get_kern_vm_start(arena);
+ kaddr = kbase + (u32)(vmf->address & PAGE_MASK);
+
+ guard(mutex)(&arena->lock);
+ page = vmalloc_to_page((void *)kaddr);
+ if (page)
+ /* already have a page vmap-ed */
+ goto out;
+
+ if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
+ /* User space requested to segfault when page is not allocated by bpf prog */
+ return VM_FAULT_SIGSEGV;
+
+ ret = mtree_insert(&arena->mt, vmf->pgoff, MT_ENTRY, GFP_KERNEL);
+ if (ret)
+ return VM_FAULT_SIGSEGV;
+
+ /* Account into memcg of the process that created bpf_arena */
+ ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page);
+ if (ret) {
+ mtree_erase(&arena->mt, vmf->pgoff);
+ return VM_FAULT_SIGSEGV;
+ }
+
+ ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page);
+ if (ret) {
+ mtree_erase(&arena->mt, vmf->pgoff);
+ __free_page(page);
+ return VM_FAULT_SIGSEGV;
+ }
+out:
+ page_ref_add(page, 1);
+ vmf->page = page;
+ return 0;
+}
+
+static const struct vm_operations_struct arena_vm_ops = {
+ .close = arena_vm_close,
+ .fault = arena_vm_fault,
+};
+
+static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ struct bpf_map *map = filp->private_data;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+ long ret;
+
+ if (pgoff)
+ return -EINVAL;
+ if (len > (1ull << 32))
+ return -E2BIG;
+
+ /* if user_vm_start was specified at arena creation time */
+ if (arena->user_vm_start) {
+ if (len > arena->user_vm_end - arena->user_vm_start)
+ return -E2BIG;
+ if (len != arena->user_vm_end - arena->user_vm_start)
+ return -EINVAL;
+ if (addr != arena->user_vm_start)
+ return -EINVAL;
+ }
+
+ ret = current->mm->get_unmapped_area(filp, addr, len * 2, 0, flags);
+ if (IS_ERR_VALUE(ret))
+ return ret;
+ if ((ret >> 32) == ((ret + len - 1) >> 32))
+ return ret;
+ if (WARN_ON_ONCE(arena->user_vm_start))
+ /* checks at map creation time should prevent this */
+ return -EFAULT;
+ return round_up(ret, 1ull << 32);
+}
+
+static int arena_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
+{
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ guard(mutex)(&arena->lock);
+ if (arena->user_vm_start && arena->user_vm_start != vma->vm_start)
+ /*
+ * If map_extra was not specified at arena creation time then
+ * 1st user process can do mmap(NULL, ...) to pick user_vm_start
+ * 2nd user process must pass the same addr to mmap(addr, MAP_FIXED..);
+ * or
+ * specify addr in map_extra and
+ * use the same addr later with mmap(addr, MAP_FIXED..);
+ */
+ return -EBUSY;
+
+ if (arena->user_vm_end && arena->user_vm_end != vma->vm_end)
+ /* all user processes must have the same size of mmap-ed region */
+ return -EBUSY;
+
+ /* Earlier checks should prevent this */
+ if (WARN_ON_ONCE(vma->vm_end - vma->vm_start > (1ull << 32) || vma->vm_pgoff))
+ return -EFAULT;
+
+ if (remember_vma(arena, vma))
+ return -ENOMEM;
+
+ arena->user_vm_start = vma->vm_start;
+ arena->user_vm_end = vma->vm_end;
+ /*
+ * bpf_map_mmap() checks that it's being mmaped as VM_SHARED and
+ * clears VM_MAYEXEC. Set VM_DONTEXPAND as well to avoid
+ * potential change of user_vm_start.
+ */
+ vm_flags_set(vma, VM_DONTEXPAND);
+ vma->vm_ops = &arena_vm_ops;
+ return 0;
+}
+
+static int arena_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off)
+{
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ if ((u64)off > arena->user_vm_end - arena->user_vm_start)
+ return -ERANGE;
+ *imm = (unsigned long)arena->user_vm_start;
+ return 0;
+}
+
+BTF_ID_LIST_SINGLE(bpf_arena_map_btf_ids, struct, bpf_arena)
+const struct bpf_map_ops arena_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc = arena_map_alloc,
+ .map_free = arena_map_free,
+ .map_direct_value_addr = arena_map_direct_value_addr,
+ .map_mmap = arena_map_mmap,
+ .map_get_unmapped_area = arena_get_unmapped_area,
+ .map_get_next_key = arena_map_get_next_key,
+ .map_push_elem = arena_map_push_elem,
+ .map_peek_elem = arena_map_peek_elem,
+ .map_pop_elem = arena_map_pop_elem,
+ .map_lookup_elem = arena_map_lookup_elem,
+ .map_update_elem = arena_map_update_elem,
+ .map_delete_elem = arena_map_delete_elem,
+ .map_check_btf = arena_map_check_btf,
+ .map_mem_usage = arena_map_mem_usage,
+ .map_btf_id = &bpf_arena_map_btf_ids[0],
+};
+
+static u64 clear_lo32(u64 val)
+{
+ return val & ~(u64)~0U;
+}
+
+/*
+ * Allocate pages and vmap them into kernel vmalloc area.
+ * Later the pages will be mmaped into user space vma.
+ */
+static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id)
+{
+ /* user_vm_end/start are fixed before bpf prog runs */
+ long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
+ u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena);
+ struct page **pages;
+ long pgoff = 0;
+ u32 uaddr32;
+ int ret, i;
+
+ if (page_cnt > page_cnt_max)
+ return 0;
+
+ if (uaddr) {
+ if (uaddr & ~PAGE_MASK)
+ return 0;
+ pgoff = compute_pgoff(arena, uaddr);
+ if (pgoff + page_cnt > page_cnt_max)
+ /* requested address will be outside of user VMA */
+ return 0;
+ }
+
+ /* zeroing is needed, since alloc_pages_bulk_array() only fills in non-zero entries */
+ pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL);
+ if (!pages)
+ return 0;
+
+ guard(mutex)(&arena->lock);
+
+ if (uaddr)
+ ret = mtree_insert_range(&arena->mt, pgoff, pgoff + page_cnt - 1,
+ MT_ENTRY, GFP_KERNEL);
+ else
+ ret = mtree_alloc_range(&arena->mt, &pgoff, MT_ENTRY,
+ page_cnt, 0, page_cnt_max - 1, GFP_KERNEL);
+ if (ret)
+ goto out_free_pages;
+
+ ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO,
+ node_id, page_cnt, pages);
+ if (ret)
+ goto out;
+
+ uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE);
+ /* Earlier checks make sure that uaddr32 + page_cnt * PAGE_SIZE will not overflow 32-bit */
+ ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32,
+ kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages);
+ if (ret) {
+ for (i = 0; i < page_cnt; i++)
+ __free_page(pages[i]);
+ goto out;
+ }
+ kvfree(pages);
+ return clear_lo32(arena->user_vm_start) + uaddr32;
+out:
+ mtree_erase(&arena->mt, pgoff);
+out_free_pages:
+ kvfree(pages);
+ return 0;
+}
+
+/*
+ * If page is present in vmalloc area, unmap it from vmalloc area,
+ * unmap it from all user space vma-s,
+ * and free it.
+ */
+static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
+{
+ struct vma_list *vml;
+
+ list_for_each_entry(vml, &arena->vma_list, head)
+ zap_page_range_single(vml->vma, uaddr,
+ PAGE_SIZE * page_cnt, NULL);
+}
+
+static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
+{
+ u64 full_uaddr, uaddr_end;
+ long kaddr, pgoff, i;
+ struct page *page;
+
+ /* only aligned lower 32-bit are relevant */
+ uaddr = (u32)uaddr;
+ uaddr &= PAGE_MASK;
+ full_uaddr = clear_lo32(arena->user_vm_start) + uaddr;
+ uaddr_end = min(arena->user_vm_end, full_uaddr + (page_cnt << PAGE_SHIFT));
+ if (full_uaddr >= uaddr_end)
+ return;
+
+ page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT;
+
+ guard(mutex)(&arena->lock);
+
+ pgoff = compute_pgoff(arena, uaddr);
+ /* clear range */
+ mtree_store_range(&arena->mt, pgoff, pgoff + page_cnt - 1, NULL, GFP_KERNEL);
+
+ if (page_cnt > 1)
+ /* bulk zap if multiple pages being freed */
+ zap_pages(arena, full_uaddr, page_cnt);
+
+ kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr;
+ for (i = 0; i < page_cnt; i++, kaddr += PAGE_SIZE, full_uaddr += PAGE_SIZE) {
+ page = vmalloc_to_page((void *)kaddr);
+ if (!page)
+ continue;
+ if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */
+ zap_pages(arena, full_uaddr, 1);
+ vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE);
+ __free_page(page);
+ }
+}
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt,
+ int node_id, u64 flags)
+{
+ struct bpf_map *map = p__map;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
+ return NULL;
+
+ return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id);
+}
+
+__bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
+{
+ struct bpf_map *map = p__map;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign)
+ return;
+ arena_free_pages(arena, (long)ptr__ign, page_cnt);
+}
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(arena_kfuncs)
+BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE)
+BTF_KFUNCS_END(arena_kfuncs)
+
+static const struct btf_kfunc_id_set common_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &arena_kfuncs,
+};
+
+static int __init kfunc_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
+}
+late_initcall(kfunc_init);
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 0bdbbbeab155..13358675ff2e 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -82,7 +82,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
int numa_node = bpf_map_attr_numa_node(attr);
u32 elem_size, index_mask, max_entries;
- bool bypass_spec_v1 = bpf_bypass_spec_v1();
+ bool bypass_spec_v1 = bpf_bypass_spec_v1(NULL);
u64 array_size, mask64;
struct bpf_array *array;
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 0fae79164187..112581cf97e7 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -548,7 +548,7 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
return -ENOENT;
/* Only allow sleepable program for resched-able iterator */
- if (prog->aux->sleepable && !bpf_iter_target_support_resched(tinfo))
+ if (prog->sleepable && !bpf_iter_target_support_resched(tinfo))
return -EINVAL;
link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
@@ -697,7 +697,7 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
struct bpf_run_ctx run_ctx, *old_run_ctx;
int ret;
- if (prog->aux->sleepable) {
+ if (prog->sleepable) {
rcu_read_lock_trace();
migrate_disable();
might_fault();
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 146824cc9689..bdea1a459153 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -414,47 +414,21 @@ void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now)
bpf_selem_unlink_storage(selem, reuse_now);
}
-/* If cacheit_lockit is false, this lookup function is lockless */
-struct bpf_local_storage_data *
-bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
- struct bpf_local_storage_map *smap,
- bool cacheit_lockit)
+void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_map *smap,
+ struct bpf_local_storage_elem *selem)
{
- struct bpf_local_storage_data *sdata;
- struct bpf_local_storage_elem *selem;
-
- /* Fast path (cache hit) */
- sdata = rcu_dereference_check(local_storage->cache[smap->cache_idx],
- bpf_rcu_lock_held());
- if (sdata && rcu_access_pointer(sdata->smap) == smap)
- return sdata;
-
- /* Slow path (cache miss) */
- hlist_for_each_entry_rcu(selem, &local_storage->list, snode,
- rcu_read_lock_trace_held())
- if (rcu_access_pointer(SDATA(selem)->smap) == smap)
- break;
-
- if (!selem)
- return NULL;
-
- sdata = SDATA(selem);
- if (cacheit_lockit) {
- unsigned long flags;
-
- /* spinlock is needed to avoid racing with the
- * parallel delete. Otherwise, publishing an already
- * deleted sdata to the cache will become a use-after-free
- * problem in the next bpf_local_storage_lookup().
- */
- raw_spin_lock_irqsave(&local_storage->lock, flags);
- if (selem_linked_to_storage(selem))
- rcu_assign_pointer(local_storage->cache[smap->cache_idx],
- sdata);
- raw_spin_unlock_irqrestore(&local_storage->lock, flags);
- }
+ unsigned long flags;
- return sdata;
+ /* spinlock is needed to avoid racing with the
+ * parallel delete. Otherwise, publishing an already
+ * deleted sdata to the cache will become a use-after-free
+ * problem in the next bpf_local_storage_lookup().
+ */
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
+ if (selem_linked_to_storage(selem))
+ rcu_assign_pointer(local_storage->cache[smap->cache_idx], SDATA(selem));
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
}
static int check_flags(const struct bpf_local_storage_data *old_sdata,
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index e8e910395bf6..68240c3c6e7d 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -260,9 +260,15 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
BTF_SET_START(sleepable_lsm_hooks)
BTF_ID(func, bpf_lsm_bpf)
BTF_ID(func, bpf_lsm_bpf_map)
-BTF_ID(func, bpf_lsm_bpf_map_alloc_security)
-BTF_ID(func, bpf_lsm_bpf_map_free_security)
+BTF_ID(func, bpf_lsm_bpf_map_create)
+BTF_ID(func, bpf_lsm_bpf_map_free)
BTF_ID(func, bpf_lsm_bpf_prog)
+BTF_ID(func, bpf_lsm_bpf_prog_load)
+BTF_ID(func, bpf_lsm_bpf_prog_free)
+BTF_ID(func, bpf_lsm_bpf_token_create)
+BTF_ID(func, bpf_lsm_bpf_token_free)
+BTF_ID(func, bpf_lsm_bpf_token_cmd)
+BTF_ID(func, bpf_lsm_bpf_token_capable)
BTF_ID(func, bpf_lsm_bprm_check_security)
BTF_ID(func, bpf_lsm_bprm_committed_creds)
BTF_ID(func, bpf_lsm_bprm_committing_creds)
@@ -276,10 +282,6 @@ BTF_ID(func, bpf_lsm_file_lock)
BTF_ID(func, bpf_lsm_file_open)
BTF_ID(func, bpf_lsm_file_receive)
-#ifdef CONFIG_SECURITY_NETWORK
-BTF_ID(func, bpf_lsm_inet_conn_established)
-#endif /* CONFIG_SECURITY_NETWORK */
-
BTF_ID(func, bpf_lsm_inode_create)
BTF_ID(func, bpf_lsm_inode_free_security)
BTF_ID(func, bpf_lsm_inode_getattr)
@@ -330,6 +332,8 @@ BTF_ID(func, bpf_lsm_sb_umount)
BTF_ID(func, bpf_lsm_settime)
#ifdef CONFIG_SECURITY_NETWORK
+BTF_ID(func, bpf_lsm_inet_conn_established)
+
BTF_ID(func, bpf_lsm_socket_accept)
BTF_ID(func, bpf_lsm_socket_bind)
BTF_ID(func, bpf_lsm_socket_connect)
@@ -357,9 +361,8 @@ BTF_ID(func, bpf_lsm_userns_create)
BTF_SET_END(sleepable_lsm_hooks)
BTF_SET_START(untrusted_lsm_hooks)
-BTF_ID(func, bpf_lsm_bpf_map_free_security)
-BTF_ID(func, bpf_lsm_bpf_prog_alloc_security)
-BTF_ID(func, bpf_lsm_bpf_prog_free_security)
+BTF_ID(func, bpf_lsm_bpf_map_free)
+BTF_ID(func, bpf_lsm_bpf_prog_free)
BTF_ID(func, bpf_lsm_file_alloc_security)
BTF_ID(func, bpf_lsm_file_free_security)
#ifdef CONFIG_SECURITY_NETWORK
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 02068bd0e4d9..43356faaa057 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -13,26 +13,17 @@
#include <linux/btf_ids.h>
#include <linux/rcupdate_wait.h>
-enum bpf_struct_ops_state {
- BPF_STRUCT_OPS_STATE_INIT,
- BPF_STRUCT_OPS_STATE_INUSE,
- BPF_STRUCT_OPS_STATE_TOBEFREE,
- BPF_STRUCT_OPS_STATE_READY,
-};
-
-#define BPF_STRUCT_OPS_COMMON_VALUE \
- refcount_t refcnt; \
- enum bpf_struct_ops_state state
-
struct bpf_struct_ops_value {
- BPF_STRUCT_OPS_COMMON_VALUE;
+ struct bpf_struct_ops_common_value common;
char data[] ____cacheline_aligned_in_smp;
};
+#define MAX_TRAMP_IMAGE_PAGES 8
+
struct bpf_struct_ops_map {
struct bpf_map map;
struct rcu_head rcu;
- const struct bpf_struct_ops *st_ops;
+ const struct bpf_struct_ops_desc *st_ops_desc;
/* protect map_update */
struct mutex lock;
/* link has all the bpf_links that is populated
@@ -40,12 +31,14 @@ struct bpf_struct_ops_map {
* (in kvalue.data).
*/
struct bpf_link **links;
- /* image is a page that has all the trampolines
+ u32 links_cnt;
+ u32 image_pages_cnt;
+ /* image_pages is an array of pages that has all the trampolines
* that stores the func args before calling the bpf_prog.
- * A PAGE_SIZE "image" is enough to store all trampoline for
- * "links[]".
*/
- void *image;
+ void *image_pages[MAX_TRAMP_IMAGE_PAGES];
+ /* The owner moduler's btf. */
+ struct btf *btf;
/* uvalue->data stores the kernel struct
* (e.g. tcp_congestion_ops) that is more useful
* to userspace than the kvalue. For example,
@@ -70,35 +63,6 @@ static DEFINE_MUTEX(update_mutex);
#define VALUE_PREFIX "bpf_struct_ops_"
#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
-/* bpf_struct_ops_##_name (e.g. bpf_struct_ops_tcp_congestion_ops) is
- * the map's value exposed to the userspace and its btf-type-id is
- * stored at the map->btf_vmlinux_value_type_id.
- *
- */
-#define BPF_STRUCT_OPS_TYPE(_name) \
-extern struct bpf_struct_ops bpf_##_name; \
- \
-struct bpf_struct_ops_##_name { \
- BPF_STRUCT_OPS_COMMON_VALUE; \
- struct _name data ____cacheline_aligned_in_smp; \
-};
-#include "bpf_struct_ops_types.h"
-#undef BPF_STRUCT_OPS_TYPE
-
-enum {
-#define BPF_STRUCT_OPS_TYPE(_name) BPF_STRUCT_OPS_TYPE_##_name,
-#include "bpf_struct_ops_types.h"
-#undef BPF_STRUCT_OPS_TYPE
- __NR_BPF_STRUCT_OPS_TYPE,
-};
-
-static struct bpf_struct_ops * const bpf_struct_ops[] = {
-#define BPF_STRUCT_OPS_TYPE(_name) \
- [BPF_STRUCT_OPS_TYPE_##_name] = &bpf_##_name,
-#include "bpf_struct_ops_types.h"
-#undef BPF_STRUCT_OPS_TYPE
-};
-
const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = {
};
@@ -108,138 +72,355 @@ const struct bpf_prog_ops bpf_struct_ops_prog_ops = {
#endif
};
-static const struct btf_type *module_type;
+BTF_ID_LIST(st_ops_ids)
+BTF_ID(struct, module)
+BTF_ID(struct, bpf_struct_ops_common_value)
+
+enum {
+ IDX_MODULE_ID,
+ IDX_ST_OPS_COMMON_VALUE_ID,
+};
+
+extern struct btf *btf_vmlinux;
-void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log)
+static bool is_valid_value_type(struct btf *btf, s32 value_id,
+ const struct btf_type *type,
+ const char *value_name)
{
- s32 type_id, value_id, module_id;
+ const struct btf_type *common_value_type;
const struct btf_member *member;
- struct bpf_struct_ops *st_ops;
- const struct btf_type *t;
- char value_name[128];
- const char *mname;
- u32 i, j;
+ const struct btf_type *vt, *mt;
- /* Ensure BTF type is emitted for "struct bpf_struct_ops_##_name" */
-#define BPF_STRUCT_OPS_TYPE(_name) BTF_TYPE_EMIT(struct bpf_struct_ops_##_name);
-#include "bpf_struct_ops_types.h"
-#undef BPF_STRUCT_OPS_TYPE
+ vt = btf_type_by_id(btf, value_id);
+ if (btf_vlen(vt) != 2) {
+ pr_warn("The number of %s's members should be 2, but we get %d\n",
+ value_name, btf_vlen(vt));
+ return false;
+ }
+ member = btf_type_member(vt);
+ mt = btf_type_by_id(btf, member->type);
+ common_value_type = btf_type_by_id(btf_vmlinux,
+ st_ops_ids[IDX_ST_OPS_COMMON_VALUE_ID]);
+ if (mt != common_value_type) {
+ pr_warn("The first member of %s should be bpf_struct_ops_common_value\n",
+ value_name);
+ return false;
+ }
+ member++;
+ mt = btf_type_by_id(btf, member->type);
+ if (mt != type) {
+ pr_warn("The second member of %s should be %s\n",
+ value_name, btf_name_by_offset(btf, type->name_off));
+ return false;
+ }
- module_id = btf_find_by_name_kind(btf, "module", BTF_KIND_STRUCT);
- if (module_id < 0) {
- pr_warn("Cannot find struct module in btf_vmlinux\n");
- return;
+ return true;
+}
+
+static void *bpf_struct_ops_image_alloc(void)
+{
+ void *image;
+ int err;
+
+ err = bpf_jit_charge_modmem(PAGE_SIZE);
+ if (err)
+ return ERR_PTR(err);
+ image = arch_alloc_bpf_trampoline(PAGE_SIZE);
+ if (!image) {
+ bpf_jit_uncharge_modmem(PAGE_SIZE);
+ return ERR_PTR(-ENOMEM);
}
- module_type = btf_type_by_id(btf, module_id);
- for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
- st_ops = bpf_struct_ops[i];
+ return image;
+}
- if (strlen(st_ops->name) + VALUE_PREFIX_LEN >=
- sizeof(value_name)) {
- pr_warn("struct_ops name %s is too long\n",
- st_ops->name);
- continue;
- }
- sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name);
+void bpf_struct_ops_image_free(void *image)
+{
+ if (image) {
+ arch_free_bpf_trampoline(image, PAGE_SIZE);
+ bpf_jit_uncharge_modmem(PAGE_SIZE);
+ }
+}
+
+#define MAYBE_NULL_SUFFIX "__nullable"
+#define MAX_STUB_NAME 128
- value_id = btf_find_by_name_kind(btf, value_name,
- BTF_KIND_STRUCT);
- if (value_id < 0) {
- pr_warn("Cannot find struct %s in btf_vmlinux\n",
- value_name);
+/* Return the type info of a stub function, if it exists.
+ *
+ * The name of a stub function is made up of the name of the struct_ops and
+ * the name of the function pointer member, separated by "__". For example,
+ * if the struct_ops type is named "foo_ops" and the function pointer
+ * member is named "bar", the stub function name would be "foo_ops__bar".
+ */
+static const struct btf_type *
+find_stub_func_proto(const struct btf *btf, const char *st_op_name,
+ const char *member_name)
+{
+ char stub_func_name[MAX_STUB_NAME];
+ const struct btf_type *func_type;
+ s32 btf_id;
+ int cp;
+
+ cp = snprintf(stub_func_name, MAX_STUB_NAME, "%s__%s",
+ st_op_name, member_name);
+ if (cp >= MAX_STUB_NAME) {
+ pr_warn("Stub function name too long\n");
+ return NULL;
+ }
+ btf_id = btf_find_by_name_kind(btf, stub_func_name, BTF_KIND_FUNC);
+ if (btf_id < 0)
+ return NULL;
+ func_type = btf_type_by_id(btf, btf_id);
+ if (!func_type)
+ return NULL;
+
+ return btf_type_by_id(btf, func_type->type); /* FUNC_PROTO */
+}
+
+/* Prepare argument info for every nullable argument of a member of a
+ * struct_ops type.
+ *
+ * Initialize a struct bpf_struct_ops_arg_info according to type info of
+ * the arguments of a stub function. (Check kCFI for more information about
+ * stub functions.)
+ *
+ * Each member in the struct_ops type has a struct bpf_struct_ops_arg_info
+ * to provide an array of struct bpf_ctx_arg_aux, which in turn provides
+ * the information that used by the verifier to check the arguments of the
+ * BPF struct_ops program assigned to the member. Here, we only care about
+ * the arguments that are marked as __nullable.
+ *
+ * The array of struct bpf_ctx_arg_aux is eventually assigned to
+ * prog->aux->ctx_arg_info of BPF struct_ops programs and passed to the
+ * verifier. (See check_struct_ops_btf_id())
+ *
+ * arg_info->info will be the list of struct bpf_ctx_arg_aux if success. If
+ * fails, it will be kept untouched.
+ */
+static int prepare_arg_info(struct btf *btf,
+ const char *st_ops_name,
+ const char *member_name,
+ const struct btf_type *func_proto,
+ struct bpf_struct_ops_arg_info *arg_info)
+{
+ const struct btf_type *stub_func_proto, *pointed_type;
+ const struct btf_param *stub_args, *args;
+ struct bpf_ctx_arg_aux *info, *info_buf;
+ u32 nargs, arg_no, info_cnt = 0;
+ u32 arg_btf_id;
+ int offset;
+
+ stub_func_proto = find_stub_func_proto(btf, st_ops_name, member_name);
+ if (!stub_func_proto)
+ return 0;
+
+ /* Check if the number of arguments of the stub function is the same
+ * as the number of arguments of the function pointer.
+ */
+ nargs = btf_type_vlen(func_proto);
+ if (nargs != btf_type_vlen(stub_func_proto)) {
+ pr_warn("the number of arguments of the stub function %s__%s does not match the number of arguments of the member %s of struct %s\n",
+ st_ops_name, member_name, member_name, st_ops_name);
+ return -EINVAL;
+ }
+
+ if (!nargs)
+ return 0;
+
+ args = btf_params(func_proto);
+ stub_args = btf_params(stub_func_proto);
+
+ info_buf = kcalloc(nargs, sizeof(*info_buf), GFP_KERNEL);
+ if (!info_buf)
+ return -ENOMEM;
+
+ /* Prepare info for every nullable argument */
+ info = info_buf;
+ for (arg_no = 0; arg_no < nargs; arg_no++) {
+ /* Skip arguments that is not suffixed with
+ * "__nullable".
+ */
+ if (!btf_param_match_suffix(btf, &stub_args[arg_no],
+ MAYBE_NULL_SUFFIX))
continue;
+
+ /* Should be a pointer to struct */
+ pointed_type = btf_type_resolve_ptr(btf,
+ args[arg_no].type,
+ &arg_btf_id);
+ if (!pointed_type ||
+ !btf_type_is_struct(pointed_type)) {
+ pr_warn("stub function %s__%s has %s tagging to an unsupported type\n",
+ st_ops_name, member_name, MAYBE_NULL_SUFFIX);
+ goto err_out;
}
- type_id = btf_find_by_name_kind(btf, st_ops->name,
- BTF_KIND_STRUCT);
- if (type_id < 0) {
- pr_warn("Cannot find struct %s in btf_vmlinux\n",
- st_ops->name);
- continue;
+ offset = btf_ctx_arg_offset(btf, func_proto, arg_no);
+ if (offset < 0) {
+ pr_warn("stub function %s__%s has an invalid trampoline ctx offset for arg#%u\n",
+ st_ops_name, member_name, arg_no);
+ goto err_out;
}
- t = btf_type_by_id(btf, type_id);
- if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) {
- pr_warn("Cannot support #%u members in struct %s\n",
- btf_type_vlen(t), st_ops->name);
- continue;
+
+ if (args[arg_no].type != stub_args[arg_no].type) {
+ pr_warn("arg#%u type in stub function %s__%s does not match with its original func_proto\n",
+ arg_no, st_ops_name, member_name);
+ goto err_out;
}
- for_each_member(j, t, member) {
- const struct btf_type *func_proto;
+ /* Fill the information of the new argument */
+ info->reg_type =
+ PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL;
+ info->btf_id = arg_btf_id;
+ info->btf = btf;
+ info->offset = offset;
- mname = btf_name_by_offset(btf, member->name_off);
- if (!*mname) {
- pr_warn("anon member in struct %s is not supported\n",
- st_ops->name);
- break;
- }
+ info++;
+ info_cnt++;
+ }
- if (__btf_member_bitfield_size(t, member)) {
- pr_warn("bit field member %s in struct %s is not supported\n",
- mname, st_ops->name);
- break;
- }
+ if (info_cnt) {
+ arg_info->info = info_buf;
+ arg_info->cnt = info_cnt;
+ } else {
+ kfree(info_buf);
+ }
- func_proto = btf_type_resolve_func_ptr(btf,
- member->type,
- NULL);
- if (func_proto &&
- btf_distill_func_proto(log, btf,
- func_proto, mname,
- &st_ops->func_models[j])) {
- pr_warn("Error in parsing func ptr %s in struct %s\n",
- mname, st_ops->name);
- break;
- }
- }
+ return 0;
- if (j == btf_type_vlen(t)) {
- if (st_ops->init(btf)) {
- pr_warn("Error in init bpf_struct_ops %s\n",
- st_ops->name);
- } else {
- st_ops->type_id = type_id;
- st_ops->type = t;
- st_ops->value_id = value_id;
- st_ops->value_type = btf_type_by_id(btf,
- value_id);
- }
- }
- }
+err_out:
+ kfree(info_buf);
+
+ return -EINVAL;
}
-extern struct btf *btf_vmlinux;
+/* Clean up the arg_info in a struct bpf_struct_ops_desc. */
+void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc)
+{
+ struct bpf_struct_ops_arg_info *arg_info;
+ int i;
-static const struct bpf_struct_ops *
-bpf_struct_ops_find_value(u32 value_id)
+ arg_info = st_ops_desc->arg_info;
+ for (i = 0; i < btf_type_vlen(st_ops_desc->type); i++)
+ kfree(arg_info[i].info);
+
+ kfree(arg_info);
+}
+
+int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
+ struct btf *btf,
+ struct bpf_verifier_log *log)
{
- unsigned int i;
+ struct bpf_struct_ops *st_ops = st_ops_desc->st_ops;
+ struct bpf_struct_ops_arg_info *arg_info;
+ const struct btf_member *member;
+ const struct btf_type *t;
+ s32 type_id, value_id;
+ char value_name[128];
+ const char *mname;
+ int i, err;
- if (!value_id || !btf_vmlinux)
- return NULL;
+ if (strlen(st_ops->name) + VALUE_PREFIX_LEN >=
+ sizeof(value_name)) {
+ pr_warn("struct_ops name %s is too long\n",
+ st_ops->name);
+ return -EINVAL;
+ }
+ sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name);
- for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
- if (bpf_struct_ops[i]->value_id == value_id)
- return bpf_struct_ops[i];
+ if (!st_ops->cfi_stubs) {
+ pr_warn("struct_ops for %s has no cfi_stubs\n", st_ops->name);
+ return -EINVAL;
}
- return NULL;
-}
+ type_id = btf_find_by_name_kind(btf, st_ops->name,
+ BTF_KIND_STRUCT);
+ if (type_id < 0) {
+ pr_warn("Cannot find struct %s in %s\n",
+ st_ops->name, btf_get_name(btf));
+ return -EINVAL;
+ }
+ t = btf_type_by_id(btf, type_id);
+ if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) {
+ pr_warn("Cannot support #%u members in struct %s\n",
+ btf_type_vlen(t), st_ops->name);
+ return -EINVAL;
+ }
-const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id)
-{
- unsigned int i;
+ value_id = btf_find_by_name_kind(btf, value_name,
+ BTF_KIND_STRUCT);
+ if (value_id < 0) {
+ pr_warn("Cannot find struct %s in %s\n",
+ value_name, btf_get_name(btf));
+ return -EINVAL;
+ }
+ if (!is_valid_value_type(btf, value_id, t, value_name))
+ return -EINVAL;
- if (!type_id || !btf_vmlinux)
- return NULL;
+ arg_info = kcalloc(btf_type_vlen(t), sizeof(*arg_info),
+ GFP_KERNEL);
+ if (!arg_info)
+ return -ENOMEM;
+
+ st_ops_desc->arg_info = arg_info;
+ st_ops_desc->type = t;
+ st_ops_desc->type_id = type_id;
+ st_ops_desc->value_id = value_id;
+ st_ops_desc->value_type = btf_type_by_id(btf, value_id);
+
+ for_each_member(i, t, member) {
+ const struct btf_type *func_proto;
- for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
- if (bpf_struct_ops[i]->type_id == type_id)
- return bpf_struct_ops[i];
+ mname = btf_name_by_offset(btf, member->name_off);
+ if (!*mname) {
+ pr_warn("anon member in struct %s is not supported\n",
+ st_ops->name);
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
+
+ if (__btf_member_bitfield_size(t, member)) {
+ pr_warn("bit field member %s in struct %s is not supported\n",
+ mname, st_ops->name);
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
+
+ func_proto = btf_type_resolve_func_ptr(btf,
+ member->type,
+ NULL);
+ if (!func_proto)
+ continue;
+
+ if (btf_distill_func_proto(log, btf,
+ func_proto, mname,
+ &st_ops->func_models[i])) {
+ pr_warn("Error in parsing func ptr %s in struct %s\n",
+ mname, st_ops->name);
+ err = -EINVAL;
+ goto errout;
+ }
+
+ err = prepare_arg_info(btf, st_ops->name, mname,
+ func_proto,
+ arg_info + i);
+ if (err)
+ goto errout;
}
- return NULL;
+ if (st_ops->init(btf)) {
+ pr_warn("Error in init bpf_struct_ops %s\n",
+ st_ops->name);
+ err = -EINVAL;
+ goto errout;
+ }
+
+ return 0;
+
+errout:
+ bpf_struct_ops_desc_release(st_ops_desc);
+
+ return err;
}
static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key,
@@ -265,7 +446,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
kvalue = &st_map->kvalue;
/* Pair with smp_store_release() during map_update */
- state = smp_load_acquire(&kvalue->state);
+ state = smp_load_acquire(&kvalue->common.state);
if (state == BPF_STRUCT_OPS_STATE_INIT) {
memset(value, 0, map->value_size);
return 0;
@@ -276,7 +457,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
*/
uvalue = value;
memcpy(uvalue, st_map->uvalue, map->value_size);
- uvalue->state = state;
+ uvalue->common.state = state;
/* This value offers the user space a general estimate of how
* many sockets are still utilizing this struct_ops for TCP
@@ -284,7 +465,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
* should sufficiently meet our present goals.
*/
refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
- refcount_set(&uvalue->refcnt, max_t(s64, refcnt, 0));
+ refcount_set(&uvalue->common.refcnt, max_t(s64, refcnt, 0));
return 0;
}
@@ -296,10 +477,9 @@ static void *bpf_struct_ops_map_lookup_elem(struct bpf_map *map, void *key)
static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
{
- const struct btf_type *t = st_map->st_ops->type;
u32 i;
- for (i = 0; i < btf_type_vlen(t); i++) {
+ for (i = 0; i < st_map->links_cnt; i++) {
if (st_map->links[i]) {
bpf_link_put(st_map->links[i]);
st_map->links[i] = NULL;
@@ -307,7 +487,16 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
}
}
-static int check_zero_holes(const struct btf_type *t, void *data)
+static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map *st_map)
+{
+ int i;
+
+ for (i = 0; i < st_map->image_pages_cnt; i++)
+ bpf_struct_ops_image_free(st_map->image_pages[i]);
+ st_map->image_pages_cnt = 0;
+}
+
+static int check_zero_holes(const struct btf *btf, const struct btf_type *t, void *data)
{
const struct btf_member *member;
u32 i, moff, msize, prev_mend = 0;
@@ -319,8 +508,8 @@ static int check_zero_holes(const struct btf_type *t, void *data)
memchr_inv(data + prev_mend, 0, moff - prev_mend))
return -EINVAL;
- mtype = btf_type_by_id(btf_vmlinux, member->type);
- mtype = btf_resolve_size(btf_vmlinux, mtype, &msize);
+ mtype = btf_type_by_id(btf, member->type);
+ mtype = btf_resolve_size(btf, mtype, &msize);
if (IS_ERR(mtype))
return PTR_ERR(mtype);
prev_mend = moff + msize;
@@ -352,9 +541,12 @@ const struct bpf_link_ops bpf_struct_ops_link_lops = {
int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
struct bpf_tramp_link *link,
const struct btf_func_model *model,
- void *stub_func, void *image, void *image_end)
+ void *stub_func,
+ void **_image, u32 *_image_off,
+ bool allow_alloc)
{
- u32 flags = BPF_TRAMP_F_INDIRECT;
+ u32 image_off = *_image_off, flags = BPF_TRAMP_F_INDIRECT;
+ void *image = *_image;
int size;
tlinks[BPF_TRAMP_FENTRY].links[0] = link;
@@ -364,27 +556,49 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
flags |= BPF_TRAMP_F_RET_FENTRY_RET;
size = arch_bpf_trampoline_size(model, flags, tlinks, NULL);
- if (size < 0)
- return size;
- if (size > (unsigned long)image_end - (unsigned long)image)
- return -E2BIG;
- return arch_prepare_bpf_trampoline(NULL, image, image_end,
+ if (size <= 0)
+ return size ? : -EFAULT;
+
+ /* Allocate image buffer if necessary */
+ if (!image || size > PAGE_SIZE - image_off) {
+ if (!allow_alloc)
+ return -E2BIG;
+
+ image = bpf_struct_ops_image_alloc();
+ if (IS_ERR(image))
+ return PTR_ERR(image);
+ image_off = 0;
+ }
+
+ size = arch_prepare_bpf_trampoline(NULL, image + image_off,
+ image + PAGE_SIZE,
model, flags, tlinks, stub_func);
+ if (size <= 0) {
+ if (image != *_image)
+ bpf_struct_ops_image_free(image);
+ return size ? : -EFAULT;
+ }
+
+ *_image = image;
+ *_image_off = image_off + size;
+ return 0;
}
static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
void *value, u64 flags)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
- const struct bpf_struct_ops *st_ops = st_map->st_ops;
+ const struct bpf_struct_ops_desc *st_ops_desc = st_map->st_ops_desc;
+ const struct bpf_struct_ops *st_ops = st_ops_desc->st_ops;
struct bpf_struct_ops_value *uvalue, *kvalue;
+ const struct btf_type *module_type;
const struct btf_member *member;
- const struct btf_type *t = st_ops->type;
+ const struct btf_type *t = st_ops_desc->type;
struct bpf_tramp_links *tlinks;
void *udata, *kdata;
int prog_fd, err;
- void *image, *image_end;
- u32 i;
+ u32 i, trampoline_start, image_off = 0;
+ void *cur_image = NULL, *image = NULL;
if (flags)
return -EINVAL;
@@ -392,16 +606,16 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
if (*(u32 *)key != 0)
return -E2BIG;
- err = check_zero_holes(st_ops->value_type, value);
+ err = check_zero_holes(st_map->btf, st_ops_desc->value_type, value);
if (err)
return err;
uvalue = value;
- err = check_zero_holes(t, uvalue->data);
+ err = check_zero_holes(st_map->btf, t, uvalue->data);
if (err)
return err;
- if (uvalue->state || refcount_read(&uvalue->refcnt))
+ if (uvalue->common.state || refcount_read(&uvalue->common.refcnt))
return -EINVAL;
tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL);
@@ -413,7 +627,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
mutex_lock(&st_map->lock);
- if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) {
+ if (kvalue->common.state != BPF_STRUCT_OPS_STATE_INIT) {
err = -EBUSY;
goto unlock;
}
@@ -422,9 +636,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
udata = &uvalue->data;
kdata = &kvalue->data;
- image = st_map->image;
- image_end = st_map->image + PAGE_SIZE;
+ module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]);
for_each_member(i, t, member) {
const struct btf_type *mtype, *ptype;
struct bpf_prog *prog;
@@ -432,7 +645,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
u32 moff;
moff = __btf_member_bit_offset(t, member) / 8;
- ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL);
+ ptype = btf_type_resolve_ptr(st_map->btf, member->type, NULL);
if (ptype == module_type) {
if (*(void **)(udata + moff))
goto reset_unlock;
@@ -457,8 +670,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
if (!ptype || !btf_type_is_func_proto(ptype)) {
u32 msize;
- mtype = btf_type_by_id(btf_vmlinux, member->type);
- mtype = btf_resolve_size(btf_vmlinux, mtype, &msize);
+ mtype = btf_type_by_id(st_map->btf, member->type);
+ mtype = btf_resolve_size(st_map->btf, mtype, &msize);
if (IS_ERR(mtype)) {
err = PTR_ERR(mtype);
goto reset_unlock;
@@ -484,7 +697,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
}
if (prog->type != BPF_PROG_TYPE_STRUCT_OPS ||
- prog->aux->attach_btf_id != st_ops->type_id ||
+ prog->aux->attach_btf_id != st_ops_desc->type_id ||
prog->expected_attach_type != i) {
bpf_prog_put(prog);
err = -EINVAL;
@@ -501,37 +714,47 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
&bpf_struct_ops_link_lops, prog);
st_map->links[i] = &link->link;
+ trampoline_start = image_off;
err = bpf_struct_ops_prepare_trampoline(tlinks, link,
- &st_ops->func_models[i],
- *(void **)(st_ops->cfi_stubs + moff),
- image, image_end);
+ &st_ops->func_models[i],
+ *(void **)(st_ops->cfi_stubs + moff),
+ &image, &image_off,
+ st_map->image_pages_cnt < MAX_TRAMP_IMAGE_PAGES);
+ if (err)
+ goto reset_unlock;
+
+ if (cur_image != image) {
+ st_map->image_pages[st_map->image_pages_cnt++] = image;
+ cur_image = image;
+ trampoline_start = 0;
+ }
if (err < 0)
goto reset_unlock;
- *(void **)(kdata + moff) = image + cfi_get_offset();
- image += err;
+ *(void **)(kdata + moff) = image + trampoline_start + cfi_get_offset();
/* put prog_id to udata */
*(unsigned long *)(udata + moff) = prog->aux->id;
}
+ if (st_ops->validate) {
+ err = st_ops->validate(kdata);
+ if (err)
+ goto reset_unlock;
+ }
+ for (i = 0; i < st_map->image_pages_cnt; i++)
+ arch_protect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE);
+
if (st_map->map.map_flags & BPF_F_LINK) {
err = 0;
- if (st_ops->validate) {
- err = st_ops->validate(kdata);
- if (err)
- goto reset_unlock;
- }
- arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
/* Let bpf_link handle registration & unregistration.
*
* Pair with smp_load_acquire() during lookup_elem().
*/
- smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY);
+ smp_store_release(&kvalue->common.state, BPF_STRUCT_OPS_STATE_READY);
goto unlock;
}
- arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
err = st_ops->reg(kdata);
if (likely(!err)) {
/* This refcnt increment on the map here after
@@ -545,7 +768,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
* It ensures the above udata updates (e.g. prog->aux->id)
* can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
*/
- smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE);
+ smp_store_release(&kvalue->common.state, BPF_STRUCT_OPS_STATE_INUSE);
goto unlock;
}
@@ -554,9 +777,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
* there was a race in registering the struct_ops (under the same name) to
* a sub-system through different struct_ops's maps.
*/
- arch_unprotect_bpf_trampoline(st_map->image, PAGE_SIZE);
reset_unlock:
+ bpf_struct_ops_map_free_image(st_map);
bpf_struct_ops_map_put_progs(st_map);
memset(uvalue, 0, map->value_size);
memset(kvalue, 0, map->value_size);
@@ -575,12 +798,12 @@ static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
if (st_map->map.map_flags & BPF_F_LINK)
return -EOPNOTSUPP;
- prev_state = cmpxchg(&st_map->kvalue.state,
+ prev_state = cmpxchg(&st_map->kvalue.common.state,
BPF_STRUCT_OPS_STATE_INUSE,
BPF_STRUCT_OPS_STATE_TOBEFREE);
switch (prev_state) {
case BPF_STRUCT_OPS_STATE_INUSE:
- st_map->st_ops->unreg(&st_map->kvalue.data);
+ st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data);
bpf_map_put(map);
return 0;
case BPF_STRUCT_OPS_STATE_TOBEFREE:
@@ -597,6 +820,7 @@ static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
struct seq_file *m)
{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
void *value;
int err;
@@ -606,7 +830,8 @@ static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
if (!err) {
- btf_type_seq_show(btf_vmlinux, map->btf_vmlinux_value_type_id,
+ btf_type_seq_show(st_map->btf,
+ map->btf_vmlinux_value_type_id,
value, m);
seq_puts(m, "\n");
}
@@ -621,16 +846,22 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map)
if (st_map->links)
bpf_struct_ops_map_put_progs(st_map);
bpf_map_area_free(st_map->links);
- if (st_map->image) {
- arch_free_bpf_trampoline(st_map->image, PAGE_SIZE);
- bpf_jit_uncharge_modmem(PAGE_SIZE);
- }
+ bpf_struct_ops_map_free_image(st_map);
bpf_map_area_free(st_map->uvalue);
bpf_map_area_free(st_map);
}
static void bpf_struct_ops_map_free(struct bpf_map *map)
{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+
+ /* st_ops->owner was acquired during map_alloc to implicitly holds
+ * the btf's refcnt. The acquire was only done when btf_is_module()
+ * st_map->btf cannot be NULL here.
+ */
+ if (btf_is_module(st_map->btf))
+ module_put(st_map->st_ops_desc->st_ops->owner);
+
/* The struct_ops's function may switch to another struct_ops.
*
* For example, bpf_tcp_cc_x->init() may switch to
@@ -654,29 +885,61 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
{
if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
- (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id)
+ (attr->map_flags & ~(BPF_F_LINK | BPF_F_VTYPE_BTF_OBJ_FD)) ||
+ !attr->btf_vmlinux_value_type_id)
return -EINVAL;
return 0;
}
static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
{
- const struct bpf_struct_ops *st_ops;
+ const struct bpf_struct_ops_desc *st_ops_desc;
size_t st_map_size;
struct bpf_struct_ops_map *st_map;
const struct btf_type *t, *vt;
+ struct module *mod = NULL;
struct bpf_map *map;
+ struct btf *btf;
int ret;
- st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id);
- if (!st_ops)
- return ERR_PTR(-ENOTSUPP);
+ if (attr->map_flags & BPF_F_VTYPE_BTF_OBJ_FD) {
+ /* The map holds btf for its whole life time. */
+ btf = btf_get_by_fd(attr->value_type_btf_obj_fd);
+ if (IS_ERR(btf))
+ return ERR_CAST(btf);
+ if (!btf_is_module(btf)) {
+ btf_put(btf);
+ return ERR_PTR(-EINVAL);
+ }
+
+ mod = btf_try_get_module(btf);
+ /* mod holds a refcnt to btf. We don't need an extra refcnt
+ * here.
+ */
+ btf_put(btf);
+ if (!mod)
+ return ERR_PTR(-EINVAL);
+ } else {
+ btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(btf))
+ return ERR_CAST(btf);
+ if (!btf)
+ return ERR_PTR(-ENOTSUPP);
+ }
+
+ st_ops_desc = bpf_struct_ops_find_value(btf, attr->btf_vmlinux_value_type_id);
+ if (!st_ops_desc) {
+ ret = -ENOTSUPP;
+ goto errout;
+ }
- vt = st_ops->value_type;
- if (attr->value_size != vt->size)
- return ERR_PTR(-EINVAL);
+ vt = st_ops_desc->value_type;
+ if (attr->value_size != vt->size) {
+ ret = -EINVAL;
+ goto errout;
+ }
- t = st_ops->type;
+ t = st_ops_desc->type;
st_map_size = sizeof(*st_map) +
/* kvalue stores the
@@ -685,48 +948,43 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
(vt->size - sizeof(struct bpf_struct_ops_value));
st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE);
- if (!st_map)
- return ERR_PTR(-ENOMEM);
+ if (!st_map) {
+ ret = -ENOMEM;
+ goto errout;
+ }
- st_map->st_ops = st_ops;
+ st_map->st_ops_desc = st_ops_desc;
map = &st_map->map;
- ret = bpf_jit_charge_modmem(PAGE_SIZE);
- if (ret) {
- __bpf_struct_ops_map_free(map);
- return ERR_PTR(ret);
- }
-
- st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE);
- if (!st_map->image) {
- /* __bpf_struct_ops_map_free() uses st_map->image as flag
- * for "charged or not". In this case, we need to unchange
- * here.
- */
- bpf_jit_uncharge_modmem(PAGE_SIZE);
- __bpf_struct_ops_map_free(map);
- return ERR_PTR(-ENOMEM);
- }
st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
+ st_map->links_cnt = btf_type_vlen(t);
st_map->links =
- bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_links *),
+ bpf_map_area_alloc(st_map->links_cnt * sizeof(struct bpf_links *),
NUMA_NO_NODE);
if (!st_map->uvalue || !st_map->links) {
- __bpf_struct_ops_map_free(map);
- return ERR_PTR(-ENOMEM);
+ ret = -ENOMEM;
+ goto errout_free;
}
+ st_map->btf = btf;
mutex_init(&st_map->lock);
bpf_map_init_from_attr(map, attr);
return map;
+
+errout_free:
+ __bpf_struct_ops_map_free(map);
+errout:
+ module_put(mod);
+
+ return ERR_PTR(ret);
}
static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
- const struct bpf_struct_ops *st_ops = st_map->st_ops;
- const struct btf_type *vt = st_ops->value_type;
+ const struct bpf_struct_ops_desc *st_ops_desc = st_map->st_ops_desc;
+ const struct btf_type *vt = st_ops_desc->value_type;
u64 usage;
usage = sizeof(*st_map) +
@@ -785,7 +1043,7 @@ static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
return map->map_type == BPF_MAP_TYPE_STRUCT_OPS &&
map->map_flags & BPF_F_LINK &&
/* Pair with smp_store_release() during map_update */
- smp_load_acquire(&st_map->kvalue.state) == BPF_STRUCT_OPS_STATE_READY;
+ smp_load_acquire(&st_map->kvalue.common.state) == BPF_STRUCT_OPS_STATE_READY;
}
static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
@@ -800,7 +1058,7 @@ static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
/* st_link->map can be NULL if
* bpf_struct_ops_link_create() fails to register.
*/
- st_map->st_ops->unreg(&st_map->kvalue.data);
+ st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data);
bpf_map_put(&st_map->map);
}
kfree(st_link);
@@ -847,7 +1105,7 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map
if (!bpf_struct_ops_valid_to_reg(new_map))
return -EINVAL;
- if (!st_map->st_ops->update)
+ if (!st_map->st_ops_desc->st_ops->update)
return -EOPNOTSUPP;
mutex_lock(&update_mutex);
@@ -860,12 +1118,12 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map
old_st_map = container_of(old_map, struct bpf_struct_ops_map, map);
/* The new and old struct_ops must be the same type. */
- if (st_map->st_ops != old_st_map->st_ops) {
+ if (st_map->st_ops_desc != old_st_map->st_ops_desc) {
err = -EINVAL;
goto err_out;
}
- err = st_map->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data);
+ err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data);
if (err)
goto err_out;
@@ -916,7 +1174,7 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
if (err)
goto err_out;
- err = st_map->st_ops->reg(st_map->kvalue.data);
+ err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data);
if (err) {
bpf_link_cleanup(&link_primer);
link = NULL;
@@ -931,3 +1189,10 @@ err_out:
kfree(link);
return err;
}
+
+void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+
+ info->btf_vmlinux_id = btf_obj_id(st_map->btf);
+}
diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h
deleted file mode 100644
index 5678a9ddf817..000000000000
--- a/kernel/bpf/bpf_struct_ops_types.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* internal file - do not include directly */
-
-#ifdef CONFIG_BPF_JIT
-#ifdef CONFIG_NET
-BPF_STRUCT_OPS_TYPE(bpf_dummy_ops)
-#endif
-#ifdef CONFIG_INET
-#include <net/tcp.h>
-BPF_STRUCT_OPS_TYPE(tcp_congestion_ops)
-#endif
-#endif
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 596471189176..90c4a32d89ff 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -19,6 +19,7 @@
#include <linux/bpf_verifier.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>
+#include <linux/bpf.h>
#include <linux/bpf_lsm.h>
#include <linux/skmsg.h>
#include <linux/perf_event.h>
@@ -241,6 +242,12 @@ struct btf_id_dtor_kfunc_tab {
struct btf_id_dtor_kfunc dtors[];
};
+struct btf_struct_ops_tab {
+ u32 cnt;
+ u32 capacity;
+ struct bpf_struct_ops_desc ops[];
+};
+
struct btf {
void *data;
struct btf_type **types;
@@ -258,6 +265,7 @@ struct btf {
struct btf_kfunc_set_tab *kfunc_set_tab;
struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab;
struct btf_struct_metas *struct_meta_tab;
+ struct btf_struct_ops_tab *struct_ops_tab;
/* split BTF support */
struct btf *base_btf;
@@ -801,9 +809,23 @@ static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
return __btf_name_valid(btf, offset);
}
+/* Allow any printable character in DATASEC names */
static bool btf_name_valid_section(const struct btf *btf, u32 offset)
{
- return __btf_name_valid(btf, offset);
+ /* offset must be valid */
+ const char *src = btf_str_by_offset(btf, offset);
+ const char *src_limit;
+
+ /* set a limit on identifier length */
+ src_limit = src + KSYM_NAME_LEN;
+ src++;
+ while (*src && src < src_limit) {
+ if (!isprint(*src))
+ return false;
+ src++;
+ }
+
+ return !*src;
}
static const char *__btf_name_by_offset(const struct btf *btf, u32 offset)
@@ -1688,11 +1710,27 @@ static void btf_free_struct_meta_tab(struct btf *btf)
btf->struct_meta_tab = NULL;
}
+static void btf_free_struct_ops_tab(struct btf *btf)
+{
+ struct btf_struct_ops_tab *tab = btf->struct_ops_tab;
+ u32 i;
+
+ if (!tab)
+ return;
+
+ for (i = 0; i < tab->cnt; i++)
+ bpf_struct_ops_desc_release(&tab->ops[i]);
+
+ kfree(tab);
+ btf->struct_ops_tab = NULL;
+}
+
static void btf_free(struct btf *btf)
{
btf_free_struct_meta_tab(btf);
btf_free_dtor_kfunc_tab(btf);
btf_free_kfunc_set_tab(btf);
+ btf_free_struct_ops_tab(btf);
kvfree(btf->types);
kvfree(btf->resolved_sizes);
kvfree(btf->resolved_ids);
@@ -1707,6 +1745,11 @@ static void btf_free_rcu(struct rcu_head *rcu)
btf_free(btf);
}
+const char *btf_get_name(const struct btf *btf)
+{
+ return btf->name;
+}
+
void btf_get(struct btf *btf)
{
refcount_inc(&btf->refcnt);
@@ -3310,30 +3353,48 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
return BTF_FIELD_FOUND;
}
-const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
- int comp_idx, const char *tag_key)
+int btf_find_next_decl_tag(const struct btf *btf, const struct btf_type *pt,
+ int comp_idx, const char *tag_key, int last_id)
{
- const char *value = NULL;
- int i;
+ int len = strlen(tag_key);
+ int i, n;
- for (i = 1; i < btf_nr_types(btf); i++) {
+ for (i = last_id + 1, n = btf_nr_types(btf); i < n; i++) {
const struct btf_type *t = btf_type_by_id(btf, i);
- int len = strlen(tag_key);
if (!btf_type_is_decl_tag(t))
continue;
- if (pt != btf_type_by_id(btf, t->type) ||
- btf_type_decl_tag(t)->component_idx != comp_idx)
+ if (pt != btf_type_by_id(btf, t->type))
+ continue;
+ if (btf_type_decl_tag(t)->component_idx != comp_idx)
continue;
if (strncmp(__btf_name_by_offset(btf, t->name_off), tag_key, len))
continue;
- /* Prevent duplicate entries for same type */
- if (value)
- return ERR_PTR(-EEXIST);
- value = __btf_name_by_offset(btf, t->name_off) + len;
+ return i;
}
- if (!value)
- return ERR_PTR(-ENOENT);
+ return -ENOENT;
+}
+
+const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
+ int comp_idx, const char *tag_key)
+{
+ const char *value = NULL;
+ const struct btf_type *t;
+ int len, id;
+
+ id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key, 0);
+ if (id < 0)
+ return ERR_PTR(id);
+
+ t = btf_type_by_id(btf, id);
+ len = strlen(tag_key);
+ value = __btf_name_by_offset(btf, t->name_off) + len;
+
+ /* Prevent duplicate entries for same type */
+ id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key, id);
+ if (id >= 0)
+ return ERR_PTR(-EEXIST);
+
return value;
}
@@ -5647,15 +5708,29 @@ static int find_kern_ctx_type_id(enum bpf_prog_type prog_type)
return ctx_type->type;
}
-const struct btf_type *
-btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
- const struct btf_type *t, enum bpf_prog_type prog_type,
- int arg)
+bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
+ const struct btf_type *t, enum bpf_prog_type prog_type,
+ int arg)
{
const struct btf_type *ctx_type;
const char *tname, *ctx_tname;
t = btf_type_by_id(btf, t->type);
+
+ /* KPROBE programs allow bpf_user_pt_regs_t typedef, which we need to
+ * check before we skip all the typedef below.
+ */
+ if (prog_type == BPF_PROG_TYPE_KPROBE) {
+ while (btf_type_is_modifier(t) && !btf_type_is_typedef(t))
+ t = btf_type_by_id(btf, t->type);
+
+ if (btf_type_is_typedef(t)) {
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (tname && strcmp(tname, "bpf_user_pt_regs_t") == 0)
+ return true;
+ }
+ }
+
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
if (!btf_type_is_struct(t)) {
@@ -5664,27 +5739,30 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
* is not supported yet.
* BPF_PROG_TYPE_RAW_TRACEPOINT is fine.
*/
- return NULL;
+ return false;
}
tname = btf_name_by_offset(btf, t->name_off);
if (!tname) {
bpf_log(log, "arg#%d struct doesn't have a name\n", arg);
- return NULL;
+ return false;
}
ctx_type = find_canonical_prog_ctx_type(prog_type);
if (!ctx_type) {
bpf_log(log, "btf_vmlinux is malformed\n");
/* should not happen */
- return NULL;
+ return false;
}
again:
ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off);
if (!ctx_tname) {
/* should not happen */
bpf_log(log, "Please fix kernel include/linux/bpf_types.h\n");
- return NULL;
+ return false;
}
+ /* program types without named context types work only with arg:ctx tag */
+ if (ctx_tname[0] == '\0')
+ return false;
/* only compare that prog's ctx type name is the same as
* kernel expects. No need to compare field by field.
* It's ok for bpf prog to do:
@@ -5693,20 +5771,20 @@ again:
* { // no fields of skb are ever used }
*/
if (strcmp(ctx_tname, "__sk_buff") == 0 && strcmp(tname, "sk_buff") == 0)
- return ctx_type;
+ return true;
if (strcmp(ctx_tname, "xdp_md") == 0 && strcmp(tname, "xdp_buff") == 0)
- return ctx_type;
+ return true;
if (strcmp(ctx_tname, tname)) {
/* bpf_user_pt_regs_t is a typedef, so resolve it to
* underlying struct and check name again
*/
if (!btf_type_is_modifier(ctx_type))
- return NULL;
+ return false;
while (btf_type_is_modifier(ctx_type))
ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type);
goto again;
}
- return ctx_type;
+ return true;
}
/* forward declarations for arch-specific underlying types of
@@ -5858,7 +5936,7 @@ static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
enum bpf_prog_type prog_type,
int arg)
{
- if (!btf_get_prog_ctx_type(log, btf, t, prog_type, arg))
+ if (!btf_is_prog_ctx_type(log, btf, t, prog_type, arg))
return -ENOENT;
return find_kern_ctx_type_id(prog_type);
}
@@ -5933,8 +6011,6 @@ struct btf *btf_parse_vmlinux(void)
/* btf_parse_vmlinux() runs under bpf_verifier_lock */
bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]);
- bpf_struct_ops_init(btf, log);
-
refcount_set(&btf->refcnt, 1);
err = btf_alloc_id(btf);
@@ -6092,6 +6168,26 @@ static bool prog_args_trusted(const struct bpf_prog *prog)
}
}
+int btf_ctx_arg_offset(const struct btf *btf, const struct btf_type *func_proto,
+ u32 arg_no)
+{
+ const struct btf_param *args;
+ const struct btf_type *t;
+ int off = 0, i;
+ u32 sz;
+
+ args = btf_params(func_proto);
+ for (i = 0; i < arg_no; i++) {
+ t = btf_type_by_id(btf, args[i].type);
+ t = btf_resolve_size(btf, t, &sz);
+ if (IS_ERR(t))
+ return PTR_ERR(t);
+ off += roundup(sz, 8);
+ }
+
+ return off;
+}
+
bool btf_ctx_access(int off, int size, enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
@@ -6228,7 +6324,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
}
info->reg_type = ctx_arg_info->reg_type;
- info->btf = btf_vmlinux;
+ info->btf = ctx_arg_info->btf ? : btf_vmlinux;
info->btf_id = ctx_arg_info->btf_id;
return true;
}
@@ -6284,6 +6380,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
__btf_name_by_offset(btf, t->name_off));
return true;
}
+EXPORT_SYMBOL_GPL(btf_ctx_access);
enum bpf_struct_walk_result {
/* < 0 error */
@@ -6946,6 +7043,81 @@ static bool btf_is_dynptr_ptr(const struct btf *btf, const struct btf_type *t)
return false;
}
+struct bpf_cand_cache {
+ const char *name;
+ u32 name_len;
+ u16 kind;
+ u16 cnt;
+ struct {
+ const struct btf *btf;
+ u32 id;
+ } cands[];
+};
+
+static DEFINE_MUTEX(cand_cache_mutex);
+
+static struct bpf_cand_cache *
+bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id);
+
+static int btf_get_ptr_to_btf_id(struct bpf_verifier_log *log, int arg_idx,
+ const struct btf *btf, const struct btf_type *t)
+{
+ struct bpf_cand_cache *cc;
+ struct bpf_core_ctx ctx = {
+ .btf = btf,
+ .log = log,
+ };
+ u32 kern_type_id, type_id;
+ int err = 0;
+
+ /* skip PTR and modifiers */
+ type_id = t->type;
+ t = btf_type_by_id(btf, t->type);
+ while (btf_type_is_modifier(t)) {
+ type_id = t->type;
+ t = btf_type_by_id(btf, t->type);
+ }
+
+ mutex_lock(&cand_cache_mutex);
+ cc = bpf_core_find_cands(&ctx, type_id);
+ if (IS_ERR(cc)) {
+ err = PTR_ERR(cc);
+ bpf_log(log, "arg#%d reference type('%s %s') candidate matching error: %d\n",
+ arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off),
+ err);
+ goto cand_cache_unlock;
+ }
+ if (cc->cnt != 1) {
+ bpf_log(log, "arg#%d reference type('%s %s') %s\n",
+ arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off),
+ cc->cnt == 0 ? "has no matches" : "is ambiguous");
+ err = cc->cnt == 0 ? -ENOENT : -ESRCH;
+ goto cand_cache_unlock;
+ }
+ if (btf_is_module(cc->cands[0].btf)) {
+ bpf_log(log, "arg#%d reference type('%s %s') points to kernel module type (unsupported)\n",
+ arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off));
+ err = -EOPNOTSUPP;
+ goto cand_cache_unlock;
+ }
+ kern_type_id = cc->cands[0].id;
+
+cand_cache_unlock:
+ mutex_unlock(&cand_cache_mutex);
+ if (err)
+ return err;
+
+ return kern_type_id;
+}
+
+enum btf_arg_tag {
+ ARG_TAG_CTX = BIT_ULL(0),
+ ARG_TAG_NONNULL = BIT_ULL(1),
+ ARG_TAG_TRUSTED = BIT_ULL(2),
+ ARG_TAG_NULLABLE = BIT_ULL(3),
+ ARG_TAG_ARENA = BIT_ULL(4),
+};
+
/* Process BTF of a function to produce high-level expectation of function
* arguments (like ARG_PTR_TO_CTX, or ARG_PTR_TO_MEM, etc). This information
* is cached in subprog info for reuse.
@@ -7009,6 +7181,8 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
args = (const struct btf_param *)(t + 1);
nargs = btf_type_vlen(t);
if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+ if (!is_global)
+ return -EINVAL;
bpf_log(log, "Global function %s() with %d > %d args. Buggy compiler.\n",
tname, nargs, MAX_BPF_FUNC_REG_ARGS);
return -EINVAL;
@@ -7018,6 +7192,8 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
if (!btf_type_is_int(t) && !btf_is_any_enum(t)) {
+ if (!is_global)
+ return -EINVAL;
bpf_log(log,
"Global function %s() doesn't return scalar. Only those are supported.\n",
tname);
@@ -7027,92 +7203,134 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
* Only PTR_TO_CTX and SCALAR are supported atm.
*/
for (i = 0; i < nargs; i++) {
- bool is_nonnull = false;
- const char *tag;
+ u32 tags = 0;
+ int id = 0;
- t = btf_type_by_id(btf, args[i].type);
-
- tag = btf_find_decl_tag_value(btf, fn_t, i, "arg:");
- if (IS_ERR(tag) && PTR_ERR(tag) == -ENOENT) {
- tag = NULL;
- } else if (IS_ERR(tag)) {
- bpf_log(log, "arg#%d type's tag fetching failure: %ld\n", i, PTR_ERR(tag));
- return PTR_ERR(tag);
- }
/* 'arg:<tag>' decl_tag takes precedence over derivation of
* register type from BTF type itself
*/
- if (tag) {
+ while ((id = btf_find_next_decl_tag(btf, fn_t, i, "arg:", id)) > 0) {
+ const struct btf_type *tag_t = btf_type_by_id(btf, id);
+ const char *tag = __btf_name_by_offset(btf, tag_t->name_off) + 4;
+
/* disallow arg tags in static subprogs */
if (!is_global) {
bpf_log(log, "arg#%d type tag is not supported in static functions\n", i);
return -EOPNOTSUPP;
}
+
if (strcmp(tag, "ctx") == 0) {
- sub->args[i].arg_type = ARG_PTR_TO_CTX;
- continue;
+ tags |= ARG_TAG_CTX;
+ } else if (strcmp(tag, "trusted") == 0) {
+ tags |= ARG_TAG_TRUSTED;
+ } else if (strcmp(tag, "nonnull") == 0) {
+ tags |= ARG_TAG_NONNULL;
+ } else if (strcmp(tag, "nullable") == 0) {
+ tags |= ARG_TAG_NULLABLE;
+ } else if (strcmp(tag, "arena") == 0) {
+ tags |= ARG_TAG_ARENA;
+ } else {
+ bpf_log(log, "arg#%d has unsupported set of tags\n", i);
+ return -EOPNOTSUPP;
}
- if (strcmp(tag, "nonnull") == 0)
- is_nonnull = true;
+ }
+ if (id != -ENOENT) {
+ bpf_log(log, "arg#%d type tag fetching failure: %d\n", i, id);
+ return id;
}
+ t = btf_type_by_id(btf, args[i].type);
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (btf_type_is_int(t) || btf_is_any_enum(t)) {
- sub->args[i].arg_type = ARG_ANYTHING;
- continue;
- }
- if (btf_type_is_ptr(t) && btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
+ if (!btf_type_is_ptr(t))
+ goto skip_pointer;
+
+ if ((tags & ARG_TAG_CTX) || btf_is_prog_ctx_type(log, btf, t, prog_type, i)) {
+ if (tags & ~ARG_TAG_CTX) {
+ bpf_log(log, "arg#%d has invalid combination of tags\n", i);
+ return -EINVAL;
+ }
+ if ((tags & ARG_TAG_CTX) &&
+ btf_validate_prog_ctx_type(log, btf, t, i, prog_type,
+ prog->expected_attach_type))
+ return -EINVAL;
sub->args[i].arg_type = ARG_PTR_TO_CTX;
continue;
}
- if (btf_type_is_ptr(t) && btf_is_dynptr_ptr(btf, t)) {
+ if (btf_is_dynptr_ptr(btf, t)) {
+ if (tags) {
+ bpf_log(log, "arg#%d has invalid combination of tags\n", i);
+ return -EINVAL;
+ }
sub->args[i].arg_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY;
continue;
}
- if (is_global && btf_type_is_ptr(t)) {
+ if (tags & ARG_TAG_TRUSTED) {
+ int kern_type_id;
+
+ if (tags & ARG_TAG_NONNULL) {
+ bpf_log(log, "arg#%d has invalid combination of tags\n", i);
+ return -EINVAL;
+ }
+
+ kern_type_id = btf_get_ptr_to_btf_id(log, i, btf, t);
+ if (kern_type_id < 0)
+ return kern_type_id;
+
+ sub->args[i].arg_type = ARG_PTR_TO_BTF_ID | PTR_TRUSTED;
+ if (tags & ARG_TAG_NULLABLE)
+ sub->args[i].arg_type |= PTR_MAYBE_NULL;
+ sub->args[i].btf_id = kern_type_id;
+ continue;
+ }
+ if (tags & ARG_TAG_ARENA) {
+ if (tags & ~ARG_TAG_ARENA) {
+ bpf_log(log, "arg#%d arena cannot be combined with any other tags\n", i);
+ return -EINVAL;
+ }
+ sub->args[i].arg_type = ARG_PTR_TO_ARENA;
+ continue;
+ }
+ if (is_global) { /* generic user data pointer */
u32 mem_size;
+ if (tags & ARG_TAG_NULLABLE) {
+ bpf_log(log, "arg#%d has invalid combination of tags\n", i);
+ return -EINVAL;
+ }
+
t = btf_type_skip_modifiers(btf, t->type, NULL);
ref_t = btf_resolve_size(btf, t, &mem_size);
if (IS_ERR(ref_t)) {
- bpf_log(log,
- "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
- i, btf_type_str(t), btf_name_by_offset(btf, t->name_off),
+ bpf_log(log, "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
+ i, btf_type_str(t), btf_name_by_offset(btf, t->name_off),
PTR_ERR(ref_t));
return -EINVAL;
}
- sub->args[i].arg_type = is_nonnull ? ARG_PTR_TO_MEM : ARG_PTR_TO_MEM_OR_NULL;
+ sub->args[i].arg_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL;
+ if (tags & ARG_TAG_NONNULL)
+ sub->args[i].arg_type &= ~PTR_MAYBE_NULL;
sub->args[i].mem_size = mem_size;
continue;
}
- if (is_nonnull) {
- bpf_log(log, "arg#%d marked as non-null, but is not a pointer type\n", i);
+
+skip_pointer:
+ if (tags) {
+ bpf_log(log, "arg#%d has pointer tag, but is not a pointer type\n", i);
return -EINVAL;
}
+ if (btf_type_is_int(t) || btf_is_any_enum(t)) {
+ sub->args[i].arg_type = ARG_ANYTHING;
+ continue;
+ }
+ if (!is_global)
+ return -EINVAL;
bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n",
i, btf_type_str(t), tname);
return -EINVAL;
}
- for (i = 0; i < nargs; i++) {
- const char *tag;
-
- if (sub->args[i].arg_type != ARG_PTR_TO_CTX)
- continue;
-
- /* check if arg has "arg:ctx" tag */
- t = btf_type_by_id(btf, args[i].type);
- tag = btf_find_decl_tag_value(btf, fn_t, i, "arg:");
- if (IS_ERR_OR_NULL(tag) || strcmp(tag, "ctx") != 0)
- continue;
-
- if (btf_validate_prog_ctx_type(log, btf, t, i, prog_type,
- prog->expected_attach_type))
- return -EINVAL;
- }
-
sub->arg_cnt = nargs;
sub->args_cached = true;
@@ -7589,6 +7807,17 @@ static struct btf *btf_get_module_btf(const struct module *module)
return btf;
}
+static int check_btf_kconfigs(const struct module *module, const char *feature)
+{
+ if (!module && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
+ pr_err("missing vmlinux BTF, cannot register %s\n", feature);
+ return -ENOENT;
+ }
+ if (module && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
+ pr_warn("missing module BTF, cannot register %s\n", feature);
+ return 0;
+}
+
BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int, flags)
{
struct btf *btf = NULL;
@@ -7949,15 +8178,8 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
int ret, i;
btf = btf_get_module_btf(kset->owner);
- if (!btf) {
- if (!kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
- pr_err("missing vmlinux BTF, cannot register kfuncs\n");
- return -ENOENT;
- }
- if (kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
- pr_warn("missing module BTF, cannot register kfuncs\n");
- return 0;
- }
+ if (!btf)
+ return check_btf_kconfigs(kset->owner, "kfunc");
if (IS_ERR(btf))
return PTR_ERR(btf);
@@ -7981,6 +8203,14 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
{
enum btf_kfunc_hook hook;
+ /* All kfuncs need to be tagged as such in BTF.
+ * WARN() for initcall registrations that do not check errors.
+ */
+ if (!(kset->set->flags & BTF_SET8_KFUNCS)) {
+ WARN_ON(!kset->owner);
+ return -EINVAL;
+ }
+
hook = bpf_prog_type_to_kfunc_hook(prog_type);
return __register_btf_kfunc_id_set(hook, kset);
}
@@ -8057,17 +8287,8 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c
int ret;
btf = btf_get_module_btf(owner);
- if (!btf) {
- if (!owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
- pr_err("missing vmlinux BTF, cannot register dtor kfuncs\n");
- return -ENOENT;
- }
- if (owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) {
- pr_err("missing module BTF, cannot register dtor kfuncs\n");
- return -ENOENT;
- }
- return 0;
- }
+ if (!btf)
+ return check_btf_kconfigs(owner, "dtor kfuncs");
if (IS_ERR(btf))
return PTR_ERR(btf);
@@ -8182,17 +8403,6 @@ size_t bpf_core_essential_name_len(const char *name)
return n;
}
-struct bpf_cand_cache {
- const char *name;
- u32 name_len;
- u16 kind;
- u16 cnt;
- struct {
- const struct btf *btf;
- u32 id;
- } cands[];
-};
-
static void bpf_free_cands(struct bpf_cand_cache *cands)
{
if (!cands->cnt)
@@ -8213,8 +8423,6 @@ static struct bpf_cand_cache *vmlinux_cand_cache[VMLINUX_CAND_CACHE_SIZE];
#define MODULE_CAND_CACHE_SIZE 31
static struct bpf_cand_cache *module_cand_cache[MODULE_CAND_CACHE_SIZE];
-static DEFINE_MUTEX(cand_cache_mutex);
-
static void __print_cand_cache(struct bpf_verifier_log *log,
struct bpf_cand_cache **cache,
int cache_size)
@@ -8645,3 +8853,141 @@ bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log,
return !strncmp(reg_name, arg_name, cmp_len);
}
+
+#ifdef CONFIG_BPF_JIT
+static int
+btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops,
+ struct bpf_verifier_log *log)
+{
+ struct btf_struct_ops_tab *tab, *new_tab;
+ int i, err;
+
+ tab = btf->struct_ops_tab;
+ if (!tab) {
+ tab = kzalloc(offsetof(struct btf_struct_ops_tab, ops[4]),
+ GFP_KERNEL);
+ if (!tab)
+ return -ENOMEM;
+ tab->capacity = 4;
+ btf->struct_ops_tab = tab;
+ }
+
+ for (i = 0; i < tab->cnt; i++)
+ if (tab->ops[i].st_ops == st_ops)
+ return -EEXIST;
+
+ if (tab->cnt == tab->capacity) {
+ new_tab = krealloc(tab,
+ offsetof(struct btf_struct_ops_tab,
+ ops[tab->capacity * 2]),
+ GFP_KERNEL);
+ if (!new_tab)
+ return -ENOMEM;
+ tab = new_tab;
+ tab->capacity *= 2;
+ btf->struct_ops_tab = tab;
+ }
+
+ tab->ops[btf->struct_ops_tab->cnt].st_ops = st_ops;
+
+ err = bpf_struct_ops_desc_init(&tab->ops[btf->struct_ops_tab->cnt], btf, log);
+ if (err)
+ return err;
+
+ btf->struct_ops_tab->cnt++;
+
+ return 0;
+}
+
+const struct bpf_struct_ops_desc *
+bpf_struct_ops_find_value(struct btf *btf, u32 value_id)
+{
+ const struct bpf_struct_ops_desc *st_ops_list;
+ unsigned int i;
+ u32 cnt;
+
+ if (!value_id)
+ return NULL;
+ if (!btf->struct_ops_tab)
+ return NULL;
+
+ cnt = btf->struct_ops_tab->cnt;
+ st_ops_list = btf->struct_ops_tab->ops;
+ for (i = 0; i < cnt; i++) {
+ if (st_ops_list[i].value_id == value_id)
+ return &st_ops_list[i];
+ }
+
+ return NULL;
+}
+
+const struct bpf_struct_ops_desc *
+bpf_struct_ops_find(struct btf *btf, u32 type_id)
+{
+ const struct bpf_struct_ops_desc *st_ops_list;
+ unsigned int i;
+ u32 cnt;
+
+ if (!type_id)
+ return NULL;
+ if (!btf->struct_ops_tab)
+ return NULL;
+
+ cnt = btf->struct_ops_tab->cnt;
+ st_ops_list = btf->struct_ops_tab->ops;
+ for (i = 0; i < cnt; i++) {
+ if (st_ops_list[i].type_id == type_id)
+ return &st_ops_list[i];
+ }
+
+ return NULL;
+}
+
+int __register_bpf_struct_ops(struct bpf_struct_ops *st_ops)
+{
+ struct bpf_verifier_log *log;
+ struct btf *btf;
+ int err = 0;
+
+ btf = btf_get_module_btf(st_ops->owner);
+ if (!btf)
+ return check_btf_kconfigs(st_ops->owner, "struct_ops");
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+
+ log = kzalloc(sizeof(*log), GFP_KERNEL | __GFP_NOWARN);
+ if (!log) {
+ err = -ENOMEM;
+ goto errout;
+ }
+
+ log->level = BPF_LOG_KERNEL;
+
+ err = btf_add_struct_ops(btf, st_ops, log);
+
+errout:
+ kfree(log);
+ btf_put(btf);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(__register_bpf_struct_ops);
+#endif
+
+bool btf_param_match_suffix(const struct btf *btf,
+ const struct btf_param *arg,
+ const char *suffix)
+{
+ int suffix_len = strlen(suffix), len;
+ const char *param_name;
+
+ /* In the future, this can be ported to use BTF tagging */
+ param_name = btf_name_by_offset(btf, arg->name_off);
+ if (str_is_empty(param_name))
+ return false;
+ len = strlen(param_name);
+ if (len <= suffix_len)
+ return false;
+ param_name += len - suffix_len;
+ return !strncmp(param_name, suffix, suffix_len);
+}
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 491d20038cbe..82243cb6c54d 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1358,15 +1358,12 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
struct sk_buff *skb,
enum cgroup_bpf_attach_type atype)
{
- unsigned int offset = skb->data - skb_network_header(skb);
+ unsigned int offset = -skb_network_offset(skb);
struct sock *save_sk;
void *saved_data_end;
struct cgroup *cgrp;
int ret;
- if (!sk || !sk_fullsock(sk))
- return 0;
-
if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
return 0;
@@ -1630,7 +1627,7 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
default:
- return bpf_base_func_proto(func_id);
+ return bpf_base_func_proto(func_id, prog);
}
}
@@ -2191,7 +2188,7 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
default:
- return bpf_base_func_proto(func_id);
+ return bpf_base_func_proto(func_id, prog);
}
}
@@ -2348,7 +2345,7 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
default:
- return bpf_base_func_proto(func_id);
+ return bpf_base_func_proto(func_id, prog);
}
}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ea6843be2616..696bc55de8e8 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -88,13 +88,18 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
return NULL;
}
+/* tell bpf programs that include vmlinux.h kernel's PAGE_SIZE */
+enum page_size_enum {
+ __PAGE_SIZE = PAGE_SIZE
+};
+
struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
{
gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
struct bpf_prog_aux *aux;
struct bpf_prog *fp;
- size = round_up(size, PAGE_SIZE);
+ size = round_up(size, __PAGE_SIZE);
fp = __vmalloc(size, gfp_flags);
if (fp == NULL)
return NULL;
@@ -682,7 +687,7 @@ static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
void bpf_prog_kallsyms_add(struct bpf_prog *fp)
{
if (!bpf_prog_kallsyms_candidate(fp) ||
- !bpf_capable())
+ !bpf_token_capable(fp->aux->token, CAP_BPF))
return;
bpf_prog_ksym_set_addr(fp);
@@ -888,7 +893,12 @@ static LIST_HEAD(pack_list);
* CONFIG_MMU=n. Use PAGE_SIZE in these cases.
*/
#ifdef PMD_SIZE
-#define BPF_PROG_PACK_SIZE (PMD_SIZE * num_possible_nodes())
+/* PMD_SIZE is really big for some archs. It doesn't make sense to
+ * reserve too much memory in one allocation. Hardcode BPF_PROG_PACK_SIZE to
+ * 2MiB * num_possible_nodes(). On most architectures PMD_SIZE will be
+ * greater than or equal to 2MB.
+ */
+#define BPF_PROG_PACK_SIZE (SZ_2M * num_possible_nodes())
#else
#define BPF_PROG_PACK_SIZE PAGE_SIZE
#endif
@@ -1675,6 +1685,7 @@ bool bpf_opcode_in_insntable(u8 code)
[BPF_LD | BPF_IND | BPF_B] = true,
[BPF_LD | BPF_IND | BPF_H] = true,
[BPF_LD | BPF_IND | BPF_W] = true,
+ [BPF_JMP | BPF_JCOND] = true,
};
#undef BPF_INSN_3_TBL
#undef BPF_INSN_2_TBL
@@ -2695,7 +2706,7 @@ void __bpf_free_used_maps(struct bpf_prog_aux *aux,
bool sleepable;
u32 i;
- sleepable = aux->sleepable;
+ sleepable = aux->prog->sleepable;
for (i = 0; i < len; i++) {
map = used_maps[i];
if (map->ops->map_poke_untrack)
@@ -2779,6 +2790,7 @@ void bpf_prog_free(struct bpf_prog *fp)
if (aux->dst_prog)
bpf_prog_put(aux->dst_prog);
+ bpf_token_put(aux->token);
INIT_WORK(&aux->work, bpf_prog_free_deferred);
schedule_work(&aux->work);
}
@@ -2925,6 +2937,21 @@ bool __weak bpf_jit_supports_far_kfunc_call(void)
return false;
}
+bool __weak bpf_jit_supports_arena(void)
+{
+ return false;
+}
+
+/* Return TRUE if the JIT backend satisfies the following two conditions:
+ * 1) JIT backend supports atomic_xchg() on pointer-sized words.
+ * 2) Under the specific arch, the implementation of xchg() is the same
+ * as atomic_xchg() on pointer-sized words.
+ */
+bool __weak bpf_jit_supports_ptr_xchg(void)
+{
+ return false;
+}
+
/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
* skb_copy_bits(), so provide a weak definition of it for NET-less config.
*/
@@ -2959,6 +2986,17 @@ void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp,
{
}
+/* for configs without MMU or 32-bit */
+__weak const struct bpf_map_ops arena_map_ops;
+__weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
+{
+ return 0;
+}
+__weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
+{
+ return 0;
+}
+
#ifdef CONFIG_BPF_SYSCALL
static int __init bpf_global_ma_init(void)
{
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index ef82ffc90cbe..a8e34416e960 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -24,6 +24,7 @@
#include <linux/filter.h>
#include <linux/ptr_ring.h>
#include <net/xdp.h>
+#include <net/hotdata.h>
#include <linux/sched.h>
#include <linux/workqueue.h>
@@ -262,6 +263,7 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
static int cpu_map_kthread_run(void *data)
{
struct bpf_cpu_map_entry *rcpu = data;
+ unsigned long last_qs = jiffies;
complete(&rcpu->kthread_running);
set_current_state(TASK_INTERRUPTIBLE);
@@ -287,10 +289,12 @@ static int cpu_map_kthread_run(void *data)
if (__ptr_ring_empty(rcpu->queue)) {
schedule();
sched = 1;
+ last_qs = jiffies;
} else {
__set_current_state(TASK_RUNNING);
}
} else {
+ rcu_softirq_qs_periodic(last_qs);
sched = cond_resched();
}
@@ -326,7 +330,8 @@ static int cpu_map_kthread_run(void *data)
/* Support running another XDP prog on this CPU */
nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list);
if (nframes) {
- m = kmem_cache_alloc_bulk(skbuff_cache, gfp, nframes, skbs);
+ m = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
+ gfp, nframes, skbs);
if (unlikely(m == 0)) {
for (i = 0; i < nframes; i++)
skbs[i] = NULL; /* effect: xdp_return_frame */
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index 2e73533a3811..dad0fb1c8e87 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -424,7 +424,7 @@ __bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask)
__bpf_kfunc_end_defs();
-BTF_SET8_START(cpumask_kfunc_btf_ids)
+BTF_KFUNCS_START(cpumask_kfunc_btf_ids)
BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
@@ -450,7 +450,7 @@ BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU)
-BTF_SET8_END(cpumask_kfunc_btf_ids)
+BTF_KFUNCS_END(cpumask_kfunc_btf_ids)
static const struct btf_kfunc_id_set cpumask_kfunc_set = {
.owner = THIS_MODULE,
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index a936c704d4e7..4e2cdbb5629f 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -130,13 +130,14 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
bpf_map_init_from_attr(&dtab->map, attr);
if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
- dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);
-
- if (!dtab->n_buckets) /* Overflow check */
+ /* hash table size must be power of 2; roundup_pow_of_two() can
+ * overflow into UB on 32-bit arches, so check that first
+ */
+ if (dtab->map.max_entries > 1UL << 31)
return -EINVAL;
- }
- if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+ dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);
+
dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets,
dtab->map.numa_node);
if (!dtab->dev_index_head)
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index 49940c26a227..bd2e2dd04740 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -166,6 +166,12 @@ static bool is_movsx(const struct bpf_insn *insn)
(insn->off == 8 || insn->off == 16 || insn->off == 32);
}
+static bool is_addr_space_cast(const struct bpf_insn *insn)
+{
+ return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) &&
+ insn->off == BPF_ADDR_SPACE_CAST;
+}
+
void print_bpf_insn(const struct bpf_insn_cbs *cbs,
const struct bpf_insn *insn,
bool allow_ptr_leaks)
@@ -184,6 +190,10 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
insn->code, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg);
+ } else if (is_addr_space_cast(insn)) {
+ verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %d, %d)\n",
+ insn->code, insn->dst_reg,
+ insn->src_reg, ((u32)insn->imm) >> 16, (u16)insn->imm);
} else if (BPF_SRC(insn->code) == BPF_X) {
verbose(cbs->private_data, "(%02x) %c%d %s %s%c%d\n",
insn->code, class == BPF_ALU ? 'w' : 'r',
@@ -322,6 +332,10 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
} else if (insn->code == (BPF_JMP | BPF_JA)) {
verbose(cbs->private_data, "(%02x) goto pc%+d\n",
insn->code, insn->off);
+ } else if (insn->code == (BPF_JMP | BPF_JCOND) &&
+ insn->src_reg == BPF_MAY_GOTO) {
+ verbose(cbs->private_data, "(%02x) may_goto pc%+d\n",
+ insn->code, insn->off);
} else if (insn->code == (BPF_JMP32 | BPF_JA)) {
verbose(cbs->private_data, "(%02x) gotol pc%+d\n",
insn->code, insn->imm);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 03a6a2500b6a..3a088a5349bc 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -499,7 +499,13 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
num_possible_cpus());
}
- /* hash table size must be power of 2 */
+ /* hash table size must be power of 2; roundup_pow_of_two() can overflow
+ * into UB on 32-bit arches, so check that first
+ */
+ err = -E2BIG;
+ if (htab->map.max_entries > 1UL << 31)
+ goto free_htab;
+
htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
htab->elem_size = sizeof(struct htab_elem) +
@@ -509,10 +515,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
else
htab->elem_size += round_up(htab->map.value_size, 8);
- err = -E2BIG;
- /* prevent zero size kmalloc and check for u32 overflow */
- if (htab->n_buckets == 0 ||
- htab->n_buckets > U32_MAX / sizeof(struct bucket))
+ /* check for u32 overflow */
+ if (htab->n_buckets > U32_MAX / sizeof(struct bucket))
goto free_htab;
err = bpf_map_init_elem_count(&htab->map);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index d19cd863d294..a89587859571 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -334,7 +334,7 @@ static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock)
__this_cpu_write(irqsave_flags, flags);
}
-notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
+NOTRACE_BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
{
__bpf_spin_lock_irqsave(lock);
return 0;
@@ -357,7 +357,7 @@ static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock)
local_irq_restore(flags);
}
-notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
+NOTRACE_BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
{
__bpf_spin_unlock_irqrestore(lock);
return 0;
@@ -1417,6 +1417,7 @@ BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
{
unsigned long *kptr = map_value;
+ /* This helper may be inlined by verifier. */
return xchg(kptr, (unsigned long)ptr);
}
@@ -1682,7 +1683,7 @@ const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak;
const struct bpf_func_proto bpf_task_pt_regs_proto __weak;
const struct bpf_func_proto *
-bpf_base_func_proto(enum bpf_func_id func_id)
+bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_map_lookup_elem:
@@ -1733,7 +1734,7 @@ bpf_base_func_proto(enum bpf_func_id func_id)
break;
}
- if (!bpf_capable())
+ if (!bpf_token_capable(prog->aux->token, CAP_BPF))
return NULL;
switch (func_id) {
@@ -1791,7 +1792,7 @@ bpf_base_func_proto(enum bpf_func_id func_id)
break;
}
- if (!perfmon_capable())
+ if (!bpf_token_capable(prog->aux->token, CAP_PERFMON))
return NULL;
switch (func_id) {
@@ -2486,9 +2487,9 @@ __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
return obj;
}
-__bpf_kfunc void *bpf_rdonly_cast(void *obj__ign, u32 btf_id__k)
+__bpf_kfunc void *bpf_rdonly_cast(const void *obj__ign, u32 btf_id__k)
{
- return obj__ign;
+ return (void *)obj__ign;
}
__bpf_kfunc void bpf_rcu_read_lock(void)
@@ -2546,7 +2547,7 @@ __bpf_kfunc void bpf_throw(u64 cookie)
__bpf_kfunc_end_defs();
-BTF_SET8_START(generic_btf_ids)
+BTF_KFUNCS_START(generic_btf_ids)
#ifdef CONFIG_KEXEC_CORE
BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
#endif
@@ -2575,7 +2576,7 @@ BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
#endif
BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_throw)
-BTF_SET8_END(generic_btf_ids)
+BTF_KFUNCS_END(generic_btf_ids)
static const struct btf_kfunc_id_set generic_kfunc_set = {
.owner = THIS_MODULE,
@@ -2591,7 +2592,7 @@ BTF_ID(struct, cgroup)
BTF_ID(func, bpf_cgroup_release_dtor)
#endif
-BTF_SET8_START(common_btf_ids)
+BTF_KFUNCS_START(common_btf_ids)
BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx)
BTF_ID_FLAGS(func, bpf_rdonly_cast)
BTF_ID_FLAGS(func, bpf_rcu_read_lock)
@@ -2620,7 +2621,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_null)
BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
BTF_ID_FLAGS(func, bpf_dynptr_size)
BTF_ID_FLAGS(func, bpf_dynptr_clone)
-BTF_SET8_END(common_btf_ids)
+BTF_KFUNCS_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {
.owner = THIS_MODULE,
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 41e0a55c35f5..af5d2ffadd70 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -20,6 +20,7 @@
#include <linux/filter.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
+#include <linux/kstrtox.h>
#include "preload/bpf_preload.h"
enum bpf_type {
@@ -98,9 +99,9 @@ static const struct inode_operations bpf_prog_iops = { };
static const struct inode_operations bpf_map_iops = { };
static const struct inode_operations bpf_link_iops = { };
-static struct inode *bpf_get_inode(struct super_block *sb,
- const struct inode *dir,
- umode_t mode)
+struct inode *bpf_get_inode(struct super_block *sb,
+ const struct inode *dir,
+ umode_t mode)
{
struct inode *inode;
@@ -594,6 +595,136 @@ struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type typ
}
EXPORT_SYMBOL(bpf_prog_get_type_path);
+struct bpffs_btf_enums {
+ const struct btf *btf;
+ const struct btf_type *cmd_t;
+ const struct btf_type *map_t;
+ const struct btf_type *prog_t;
+ const struct btf_type *attach_t;
+};
+
+static int find_bpffs_btf_enums(struct bpffs_btf_enums *info)
+{
+ const struct btf *btf;
+ const struct btf_type *t;
+ const char *name;
+ int i, n;
+
+ memset(info, 0, sizeof(*info));
+
+ btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+ if (!btf)
+ return -ENOENT;
+
+ info->btf = btf;
+
+ for (i = 1, n = btf_nr_types(btf); i < n; i++) {
+ t = btf_type_by_id(btf, i);
+ if (!btf_type_is_enum(t))
+ continue;
+
+ name = btf_name_by_offset(btf, t->name_off);
+ if (!name)
+ continue;
+
+ if (strcmp(name, "bpf_cmd") == 0)
+ info->cmd_t = t;
+ else if (strcmp(name, "bpf_map_type") == 0)
+ info->map_t = t;
+ else if (strcmp(name, "bpf_prog_type") == 0)
+ info->prog_t = t;
+ else if (strcmp(name, "bpf_attach_type") == 0)
+ info->attach_t = t;
+ else
+ continue;
+
+ if (info->cmd_t && info->map_t && info->prog_t && info->attach_t)
+ return 0;
+ }
+
+ return -ESRCH;
+}
+
+static bool find_btf_enum_const(const struct btf *btf, const struct btf_type *enum_t,
+ const char *prefix, const char *str, int *value)
+{
+ const struct btf_enum *e;
+ const char *name;
+ int i, n, pfx_len = strlen(prefix);
+
+ *value = 0;
+
+ if (!btf || !enum_t)
+ return false;
+
+ for (i = 0, n = btf_vlen(enum_t); i < n; i++) {
+ e = &btf_enum(enum_t)[i];
+
+ name = btf_name_by_offset(btf, e->name_off);
+ if (!name || strncasecmp(name, prefix, pfx_len) != 0)
+ continue;
+
+ /* match symbolic name case insensitive and ignoring prefix */
+ if (strcasecmp(name + pfx_len, str) == 0) {
+ *value = e->val;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static void seq_print_delegate_opts(struct seq_file *m,
+ const char *opt_name,
+ const struct btf *btf,
+ const struct btf_type *enum_t,
+ const char *prefix,
+ u64 delegate_msk, u64 any_msk)
+{
+ const struct btf_enum *e;
+ bool first = true;
+ const char *name;
+ u64 msk;
+ int i, n, pfx_len = strlen(prefix);
+
+ delegate_msk &= any_msk; /* clear unknown bits */
+
+ if (delegate_msk == 0)
+ return;
+
+ seq_printf(m, ",%s", opt_name);
+ if (delegate_msk == any_msk) {
+ seq_printf(m, "=any");
+ return;
+ }
+
+ if (btf && enum_t) {
+ for (i = 0, n = btf_vlen(enum_t); i < n; i++) {
+ e = &btf_enum(enum_t)[i];
+ name = btf_name_by_offset(btf, e->name_off);
+ if (!name || strncasecmp(name, prefix, pfx_len) != 0)
+ continue;
+ msk = 1ULL << e->val;
+ if (delegate_msk & msk) {
+ /* emit lower-case name without prefix */
+ seq_printf(m, "%c", first ? '=' : ':');
+ name += pfx_len;
+ while (*name) {
+ seq_printf(m, "%c", tolower(*name));
+ name++;
+ }
+
+ delegate_msk &= ~msk;
+ first = false;
+ }
+ }
+ }
+ if (delegate_msk)
+ seq_printf(m, "%c0x%llx", first ? '=' : ':', delegate_msk);
+}
+
/*
* Display the mount options in /proc/mounts.
*/
@@ -601,6 +732,8 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)
{
struct inode *inode = d_inode(root);
umode_t mode = inode->i_mode & S_IALLUGO & ~S_ISVTX;
+ struct bpf_mount_opts *opts = root->d_sb->s_fs_info;
+ u64 mask;
if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID))
seq_printf(m, ",uid=%u",
@@ -610,6 +743,35 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)
from_kgid_munged(&init_user_ns, inode->i_gid));
if (mode != S_IRWXUGO)
seq_printf(m, ",mode=%o", mode);
+
+ if (opts->delegate_cmds || opts->delegate_maps ||
+ opts->delegate_progs || opts->delegate_attachs) {
+ struct bpffs_btf_enums info;
+
+ /* ignore errors, fallback to hex */
+ (void)find_bpffs_btf_enums(&info);
+
+ mask = (1ULL << __MAX_BPF_CMD) - 1;
+ seq_print_delegate_opts(m, "delegate_cmds",
+ info.btf, info.cmd_t, "BPF_",
+ opts->delegate_cmds, mask);
+
+ mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1;
+ seq_print_delegate_opts(m, "delegate_maps",
+ info.btf, info.map_t, "BPF_MAP_TYPE_",
+ opts->delegate_maps, mask);
+
+ mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1;
+ seq_print_delegate_opts(m, "delegate_progs",
+ info.btf, info.prog_t, "BPF_PROG_TYPE_",
+ opts->delegate_progs, mask);
+
+ mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1;
+ seq_print_delegate_opts(m, "delegate_attachs",
+ info.btf, info.attach_t, "BPF_",
+ opts->delegate_attachs, mask);
+ }
+
return 0;
}
@@ -624,7 +786,7 @@ static void bpf_free_inode(struct inode *inode)
free_inode_nonrcu(inode);
}
-static const struct super_operations bpf_super_ops = {
+const struct super_operations bpf_super_ops = {
.statfs = simple_statfs,
.drop_inode = generic_delete_inode,
.show_options = bpf_show_options,
@@ -635,28 +797,30 @@ enum {
OPT_UID,
OPT_GID,
OPT_MODE,
+ OPT_DELEGATE_CMDS,
+ OPT_DELEGATE_MAPS,
+ OPT_DELEGATE_PROGS,
+ OPT_DELEGATE_ATTACHS,
};
static const struct fs_parameter_spec bpf_fs_parameters[] = {
fsparam_u32 ("uid", OPT_UID),
fsparam_u32 ("gid", OPT_GID),
fsparam_u32oct ("mode", OPT_MODE),
+ fsparam_string ("delegate_cmds", OPT_DELEGATE_CMDS),
+ fsparam_string ("delegate_maps", OPT_DELEGATE_MAPS),
+ fsparam_string ("delegate_progs", OPT_DELEGATE_PROGS),
+ fsparam_string ("delegate_attachs", OPT_DELEGATE_ATTACHS),
{}
};
-struct bpf_mount_opts {
- kuid_t uid;
- kgid_t gid;
- umode_t mode;
-};
-
static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- struct bpf_mount_opts *opts = fc->fs_private;
+ struct bpf_mount_opts *opts = fc->s_fs_info;
struct fs_parse_result result;
kuid_t uid;
kgid_t gid;
- int opt;
+ int opt, err;
opt = fs_parse(fc, bpf_fs_parameters, param, &result);
if (opt < 0) {
@@ -708,6 +872,67 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
case OPT_MODE:
opts->mode = result.uint_32 & S_IALLUGO;
break;
+ case OPT_DELEGATE_CMDS:
+ case OPT_DELEGATE_MAPS:
+ case OPT_DELEGATE_PROGS:
+ case OPT_DELEGATE_ATTACHS: {
+ struct bpffs_btf_enums info;
+ const struct btf_type *enum_t;
+ const char *enum_pfx;
+ u64 *delegate_msk, msk = 0;
+ char *p;
+ int val;
+
+ /* ignore errors, fallback to hex */
+ (void)find_bpffs_btf_enums(&info);
+
+ switch (opt) {
+ case OPT_DELEGATE_CMDS:
+ delegate_msk = &opts->delegate_cmds;
+ enum_t = info.cmd_t;
+ enum_pfx = "BPF_";
+ break;
+ case OPT_DELEGATE_MAPS:
+ delegate_msk = &opts->delegate_maps;
+ enum_t = info.map_t;
+ enum_pfx = "BPF_MAP_TYPE_";
+ break;
+ case OPT_DELEGATE_PROGS:
+ delegate_msk = &opts->delegate_progs;
+ enum_t = info.prog_t;
+ enum_pfx = "BPF_PROG_TYPE_";
+ break;
+ case OPT_DELEGATE_ATTACHS:
+ delegate_msk = &opts->delegate_attachs;
+ enum_t = info.attach_t;
+ enum_pfx = "BPF_";
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ while ((p = strsep(&param->string, ":"))) {
+ if (strcmp(p, "any") == 0) {
+ msk |= ~0ULL;
+ } else if (find_btf_enum_const(info.btf, enum_t, enum_pfx, p, &val)) {
+ msk |= 1ULL << val;
+ } else {
+ err = kstrtou64(p, 0, &msk);
+ if (err)
+ return err;
+ }
+ }
+
+ /* Setting delegation mount options requires privileges */
+ if (msk && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ *delegate_msk |= msk;
+ break;
+ }
+ default:
+ /* ignore unknown mount options */
+ break;
}
return 0;
@@ -784,10 +1009,14 @@ out:
static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
{
static const struct tree_descr bpf_rfiles[] = { { "" } };
- struct bpf_mount_opts *opts = fc->fs_private;
+ struct bpf_mount_opts *opts = sb->s_fs_info;
struct inode *inode;
int ret;
+ /* Mounting an instance of BPF FS requires privileges */
+ if (fc->user_ns != &init_user_ns && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);
if (ret)
return ret;
@@ -811,7 +1040,7 @@ static int bpf_get_tree(struct fs_context *fc)
static void bpf_free_fc(struct fs_context *fc)
{
- kfree(fc->fs_private);
+ kfree(fc->s_fs_info);
}
static const struct fs_context_operations bpf_context_ops = {
@@ -835,17 +1064,32 @@ static int bpf_init_fs_context(struct fs_context *fc)
opts->uid = current_fsuid();
opts->gid = current_fsgid();
- fc->fs_private = opts;
+ /* start out with no BPF token delegation enabled */
+ opts->delegate_cmds = 0;
+ opts->delegate_maps = 0;
+ opts->delegate_progs = 0;
+ opts->delegate_attachs = 0;
+
+ fc->s_fs_info = opts;
fc->ops = &bpf_context_ops;
return 0;
}
+static void bpf_kill_super(struct super_block *sb)
+{
+ struct bpf_mount_opts *opts = sb->s_fs_info;
+
+ kill_litter_super(sb);
+ kfree(opts);
+}
+
static struct file_system_type bpf_fs_type = {
.owner = THIS_MODULE,
.name = "bpf",
.init_fs_context = bpf_init_fs_context,
.parameters = bpf_fs_parameters,
- .kill_sb = kill_litter_super,
+ .kill_sb = bpf_kill_super,
+ .fs_flags = FS_USERNS_MOUNT,
};
static int __init bpf_init(void)
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 594a234f122b..2a243cf37c60 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -9,6 +9,7 @@
#include <linux/bpf.h>
#include <linux/bpf_verifier.h>
#include <linux/math64.h>
+#include <linux/string.h>
#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args)
@@ -333,7 +334,8 @@ find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
{
const struct bpf_line_info *linfo;
const struct bpf_prog *prog;
- u32 i, nr_linfo;
+ u32 nr_linfo;
+ int l, r, m;
prog = env->prog;
nr_linfo = prog->aux->nr_linfo;
@@ -342,11 +344,30 @@ find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
return NULL;
linfo = prog->aux->linfo;
- for (i = 1; i < nr_linfo; i++)
- if (insn_off < linfo[i].insn_off)
- break;
+ /* Loop invariant: linfo[l].insn_off <= insns_off.
+ * linfo[0].insn_off == 0 which always satisfies above condition.
+ * Binary search is searching for rightmost linfo entry that satisfies
+ * the above invariant, giving us the desired record that covers given
+ * instruction offset.
+ */
+ l = 0;
+ r = nr_linfo - 1;
+ while (l < r) {
+ /* (r - l + 1) / 2 means we break a tie to the right, so if:
+ * l=1, r=2, linfo[l].insn_off <= insn_off, linfo[r].insn_off > insn_off,
+ * then m=2, we see that linfo[m].insn_off > insn_off, and so
+ * r becomes 1 and we exit the loop with correct l==1.
+ * If the tie was broken to the left, m=1 would end us up in
+ * an endless loop where l and m stay at 1 and r stays at 2.
+ */
+ m = l + (r - l + 1) / 2;
+ if (linfo[m].insn_off <= insn_off)
+ l = m;
+ else
+ r = m - 1;
+ }
- return &linfo[i - 1];
+ return &linfo[l];
}
static const char *ltrim(const char *s)
@@ -361,13 +382,28 @@ __printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env,
u32 insn_off,
const char *prefix_fmt, ...)
{
- const struct bpf_line_info *linfo;
+ const struct bpf_line_info *linfo, *prev_linfo;
+ const struct btf *btf;
+ const char *s, *fname;
if (!bpf_verifier_log_needed(&env->log))
return;
+ prev_linfo = env->prev_linfo;
linfo = find_linfo(env, insn_off);
- if (!linfo || linfo == env->prev_linfo)
+ if (!linfo || linfo == prev_linfo)
+ return;
+
+ /* It often happens that two separate linfo records point to the same
+ * source code line, but have differing column numbers. Given verifier
+ * log doesn't emit column information, from user perspective we just
+ * end up emitting the same source code line twice unnecessarily.
+ * So instead check that previous and current linfo record point to
+ * the same file (file_name_offs match) and the same line number, and
+ * avoid emitting duplicated source code line in such case.
+ */
+ if (prev_linfo && linfo->file_name_off == prev_linfo->file_name_off &&
+ BPF_LINE_INFO_LINE_NUM(linfo->line_col) == BPF_LINE_INFO_LINE_NUM(prev_linfo->line_col))
return;
if (prefix_fmt) {
@@ -378,9 +414,15 @@ __printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env,
va_end(args);
}
- verbose(env, "%s\n",
- ltrim(btf_name_by_offset(env->prog->aux->btf,
- linfo->line_off)));
+ btf = env->prog->aux->btf;
+ s = ltrim(btf_name_by_offset(btf, linfo->line_off));
+ verbose(env, "%s", s); /* source code line */
+
+ s = btf_name_by_offset(btf, linfo->file_name_off);
+ /* leave only file name */
+ fname = strrchr(s, '/');
+ fname = fname ? fname + 1 : s;
+ verbose(env, " @ %s:%u\n", fname, BPF_LINE_INFO_LINE_NUM(linfo->line_col));
env->prev_linfo = linfo;
}
@@ -416,6 +458,7 @@ const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type)
[PTR_TO_XDP_SOCK] = "xdp_sock",
[PTR_TO_BTF_ID] = "ptr_",
[PTR_TO_MEM] = "mem",
+ [PTR_TO_ARENA] = "arena",
[PTR_TO_BUF] = "buf",
[PTR_TO_FUNC] = "func",
[PTR_TO_MAP_KEY] = "map_key",
@@ -651,6 +694,8 @@ static void print_reg_state(struct bpf_verifier_env *env,
}
verbose(env, "%s", reg_type_str(env, t));
+ if (t == PTR_TO_ARENA)
+ return;
if (t == PTR_TO_STACK) {
if (state->frameno != reg->frameno)
verbose(env, "[%d]", reg->frameno);
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index b32be680da6c..050fe1ebf0f7 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -164,13 +164,13 @@ static inline int extract_bit(const u8 *data, size_t index)
*/
static size_t longest_prefix_match(const struct lpm_trie *trie,
const struct lpm_trie_node *node,
- const struct bpf_lpm_trie_key *key)
+ const struct bpf_lpm_trie_key_u8 *key)
{
u32 limit = min(node->prefixlen, key->prefixlen);
u32 prefixlen = 0, i = 0;
BUILD_BUG_ON(offsetof(struct lpm_trie_node, data) % sizeof(u32));
- BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key, data) % sizeof(u32));
+ BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key_u8, data) % sizeof(u32));
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(CONFIG_64BIT)
@@ -229,7 +229,7 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
{
struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
struct lpm_trie_node *node, *found = NULL;
- struct bpf_lpm_trie_key *key = _key;
+ struct bpf_lpm_trie_key_u8 *key = _key;
if (key->prefixlen > trie->max_prefixlen)
return NULL;
@@ -309,7 +309,7 @@ static long trie_update_elem(struct bpf_map *map,
struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL;
struct lpm_trie_node __rcu **slot;
- struct bpf_lpm_trie_key *key = _key;
+ struct bpf_lpm_trie_key_u8 *key = _key;
unsigned long irq_flags;
unsigned int next_bit;
size_t matchlen = 0;
@@ -437,7 +437,7 @@ out:
static long trie_delete_elem(struct bpf_map *map, void *_key)
{
struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
- struct bpf_lpm_trie_key *key = _key;
+ struct bpf_lpm_trie_key_u8 *key = _key;
struct lpm_trie_node __rcu **trim, **trim2;
struct lpm_trie_node *node, *parent;
unsigned long irq_flags;
@@ -536,7 +536,7 @@ out:
sizeof(struct lpm_trie_node))
#define LPM_VAL_SIZE_MIN 1
-#define LPM_KEY_SIZE(X) (sizeof(struct bpf_lpm_trie_key) + (X))
+#define LPM_KEY_SIZE(X) (sizeof(struct bpf_lpm_trie_key_u8) + (X))
#define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)
#define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)
@@ -565,7 +565,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
/* copy mandatory map attributes */
bpf_map_init_from_attr(&trie->map, attr);
trie->data_size = attr->key_size -
- offsetof(struct bpf_lpm_trie_key, data);
+ offsetof(struct bpf_lpm_trie_key_u8, data);
trie->max_prefixlen = trie->data_size * 8;
spin_lock_init(&trie->lock);
@@ -616,7 +616,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
{
struct lpm_trie_node *node, *next_node = NULL, *parent, *search_root;
struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
- struct bpf_lpm_trie_key *key = _key, *next_key = _next_key;
+ struct bpf_lpm_trie_key_u8 *key = _key, *next_key = _next_key;
struct lpm_trie_node **node_stack = NULL;
int err = 0, stack_ptr = -1;
unsigned int next_bit;
@@ -703,7 +703,7 @@ find_leftmost:
}
do_copy:
next_key->prefixlen = next_node->prefixlen;
- memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data),
+ memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key_u8, data),
next_node->data, trie->data_size);
free_stack:
kfree(node_stack);
@@ -715,7 +715,7 @@ static int trie_check_btf(const struct bpf_map *map,
const struct btf_type *key_type,
const struct btf_type *value_type)
{
- /* Keys must have struct bpf_lpm_trie_key embedded. */
+ /* Keys must have struct bpf_lpm_trie_key_u8 embedded. */
return BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ?
-EINVAL : 0;
}
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index 6abd7c5df4b3..9575314f40a6 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -213,9 +213,9 @@ __bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map)
__bpf_kfunc_end_defs();
-BTF_SET8_START(bpf_map_iter_kfunc_ids)
+BTF_KFUNCS_START(bpf_map_iter_kfunc_ids)
BTF_ID_FLAGS(func, bpf_map_sum_elem_count, KF_TRUSTED_ARGS)
-BTF_SET8_END(bpf_map_iter_kfunc_ids)
+BTF_KFUNCS_END(bpf_map_iter_kfunc_ids)
static const struct btf_kfunc_id_set bpf_map_iter_kfunc_set = {
.owner = THIS_MODULE,
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index dff7ba539701..c99f8e5234ac 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -91,11 +91,14 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
} else if (value_size / 8 > sysctl_perf_event_max_stack)
return ERR_PTR(-EINVAL);
- /* hash table size must be power of 2 */
- n_buckets = roundup_pow_of_two(attr->max_entries);
- if (!n_buckets)
+ /* hash table size must be power of 2; roundup_pow_of_two() can overflow
+ * into UB on 32-bit arches, so check that first
+ */
+ if (attr->max_entries > 1UL << 31)
return ERR_PTR(-E2BIG);
+ n_buckets = roundup_pow_of_two(attr->max_entries);
+
cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
if (!smap)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a1f18681721c..ae2ff73bde7e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -164,6 +164,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
if (bpf_map_is_offloaded(map)) {
return bpf_map_offload_update_elem(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
+ map->map_type == BPF_MAP_TYPE_ARENA ||
map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
return map->ops->map_update_elem(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
@@ -479,6 +480,39 @@ static void bpf_map_release_memcg(struct bpf_map *map)
}
#endif
+int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
+ unsigned long nr_pages, struct page **pages)
+{
+ unsigned long i, j;
+ struct page *pg;
+ int ret = 0;
+#ifdef CONFIG_MEMCG_KMEM
+ struct mem_cgroup *memcg, *old_memcg;
+
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
+#endif
+ for (i = 0; i < nr_pages; i++) {
+ pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0);
+
+ if (pg) {
+ pages[i] = pg;
+ continue;
+ }
+ for (j = 0; j < i; j++)
+ __free_page(pages[j]);
+ ret = -ENOMEM;
+ break;
+ }
+
+#ifdef CONFIG_MEMCG_KMEM
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+#endif
+ return ret;
+}
+
+
static int btf_field_cmp(const void *a, const void *b)
{
const struct btf_field *f1 = a, *f2 = b;
@@ -937,6 +971,21 @@ static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts)
return EPOLLERR;
}
+static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ struct bpf_map *map = filp->private_data;
+
+ if (map->ops->map_get_unmapped_area)
+ return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags);
+#ifdef CONFIG_MMU
+ return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+#else
+ return addr;
+#endif
+}
+
const struct file_operations bpf_map_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_map_show_fdinfo,
@@ -946,6 +995,7 @@ const struct file_operations bpf_map_fops = {
.write = bpf_dummy_write,
.mmap = bpf_map_mmap,
.poll = bpf_map_poll,
+ .get_unmapped_area = bpf_get_unmapped_area,
};
int bpf_map_new_fd(struct bpf_map *map, int flags)
@@ -1011,8 +1061,8 @@ int map_check_no_btf(const struct bpf_map *map,
return -ENOTSUPP;
}
-static int map_check_btf(struct bpf_map *map, const struct btf *btf,
- u32 btf_key_id, u32 btf_value_id)
+static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
+ const struct btf *btf, u32 btf_key_id, u32 btf_value_id)
{
const struct btf_type *key_type, *value_type;
u32 key_size, value_size;
@@ -1040,7 +1090,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
if (!IS_ERR_OR_NULL(map->record)) {
int i;
- if (!bpf_capable()) {
+ if (!bpf_token_capable(token, CAP_BPF)) {
ret = -EPERM;
goto free_map_tab;
}
@@ -1123,14 +1173,21 @@ free_map_tab:
return ret;
}
-#define BPF_MAP_CREATE_LAST_FIELD map_extra
+static bool bpf_net_capable(void)
+{
+ return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN);
+}
+
+#define BPF_MAP_CREATE_LAST_FIELD map_token_fd
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
const struct bpf_map_ops *ops;
+ struct bpf_token *token = NULL;
int numa_node = bpf_map_attr_numa_node(attr);
u32 map_type = attr->map_type;
struct bpf_map *map;
+ bool token_flag;
int f_flags;
int err;
@@ -1138,6 +1195,12 @@ static int map_create(union bpf_attr *attr)
if (err)
return -EINVAL;
+ /* check BPF_F_TOKEN_FD flag, remember if it's set, and then clear it
+ * to avoid per-map type checks tripping on unknown flag
+ */
+ token_flag = attr->map_flags & BPF_F_TOKEN_FD;
+ attr->map_flags &= ~BPF_F_TOKEN_FD;
+
if (attr->btf_vmlinux_value_type_id) {
if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
attr->btf_key_type_id || attr->btf_value_type_id)
@@ -1147,6 +1210,7 @@ static int map_create(union bpf_attr *attr)
}
if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
+ attr->map_type != BPF_MAP_TYPE_ARENA &&
attr->map_extra != 0)
return -EINVAL;
@@ -1178,14 +1242,32 @@ static int map_create(union bpf_attr *attr)
if (!ops->map_mem_usage)
return -EINVAL;
+ if (token_flag) {
+ token = bpf_token_get_from_fd(attr->map_token_fd);
+ if (IS_ERR(token))
+ return PTR_ERR(token);
+
+ /* if current token doesn't grant map creation permissions,
+ * then we can't use this token, so ignore it and rely on
+ * system-wide capabilities checks
+ */
+ if (!bpf_token_allow_cmd(token, BPF_MAP_CREATE) ||
+ !bpf_token_allow_map_type(token, attr->map_type)) {
+ bpf_token_put(token);
+ token = NULL;
+ }
+ }
+
+ err = -EPERM;
+
/* Intent here is for unprivileged_bpf_disabled to block BPF map
* creation for unprivileged users; other actions depend
* on fd availability and access to bpffs, so are dependent on
* object creation success. Even with unprivileged BPF disabled,
* capability checks are still carried out.
*/
- if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
- return -EPERM;
+ if (sysctl_unprivileged_bpf_disabled && !bpf_token_capable(token, CAP_BPF))
+ goto put_token;
/* check privileged map type permissions */
switch (map_type) {
@@ -1218,25 +1300,28 @@ static int map_create(union bpf_attr *attr)
case BPF_MAP_TYPE_LRU_PERCPU_HASH:
case BPF_MAP_TYPE_STRUCT_OPS:
case BPF_MAP_TYPE_CPUMAP:
- if (!bpf_capable())
- return -EPERM;
+ case BPF_MAP_TYPE_ARENA:
+ if (!bpf_token_capable(token, CAP_BPF))
+ goto put_token;
break;
case BPF_MAP_TYPE_SOCKMAP:
case BPF_MAP_TYPE_SOCKHASH:
case BPF_MAP_TYPE_DEVMAP:
case BPF_MAP_TYPE_DEVMAP_HASH:
case BPF_MAP_TYPE_XSKMAP:
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
+ if (!bpf_token_capable(token, CAP_NET_ADMIN))
+ goto put_token;
break;
default:
WARN(1, "unsupported map type %d", map_type);
- return -EPERM;
+ goto put_token;
}
map = ops->map_alloc(attr);
- if (IS_ERR(map))
- return PTR_ERR(map);
+ if (IS_ERR(map)) {
+ err = PTR_ERR(map);
+ goto put_token;
+ }
map->ops = ops;
map->map_type = map_type;
@@ -1273,7 +1358,7 @@ static int map_create(union bpf_attr *attr)
map->btf = btf;
if (attr->btf_value_type_id) {
- err = map_check_btf(map, btf, attr->btf_key_type_id,
+ err = map_check_btf(map, token, btf, attr->btf_key_type_id,
attr->btf_value_type_id);
if (err)
goto free_map;
@@ -1285,15 +1370,16 @@ static int map_create(union bpf_attr *attr)
attr->btf_vmlinux_value_type_id;
}
- err = security_bpf_map_alloc(map);
+ err = security_bpf_map_create(map, attr, token);
if (err)
- goto free_map;
+ goto free_map_sec;
err = bpf_map_alloc_id(map);
if (err)
goto free_map_sec;
bpf_map_save_memcg(map);
+ bpf_token_put(token);
err = bpf_map_new_fd(map, f_flags);
if (err < 0) {
@@ -1314,6 +1400,8 @@ free_map_sec:
free_map:
btf_put(map->btf);
map->ops->map_free(map);
+put_token:
+ bpf_token_put(token);
return err;
}
@@ -2144,7 +2232,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
kvfree(aux->func_info);
kfree(aux->func_info_aux);
free_uid(aux->user);
- security_bpf_prog_free(aux);
+ security_bpf_prog_free(aux->prog);
bpf_prog_free(aux->prog);
}
@@ -2160,7 +2248,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
btf_put(prog->aux->attach_btf);
if (deferred) {
- if (prog->aux->sleepable)
+ if (prog->sleepable)
call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu);
else
call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
@@ -2590,13 +2678,15 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
}
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD log_true_size
+#define BPF_PROG_LOAD_LAST_FIELD prog_token_fd
static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
{
enum bpf_prog_type type = attr->prog_type;
struct bpf_prog *prog, *dst_prog = NULL;
struct btf *attach_btf = NULL;
+ struct bpf_token *token = NULL;
+ bool bpf_cap;
int err;
char license[128];
@@ -2610,13 +2700,35 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
BPF_F_TEST_RND_HI32 |
BPF_F_XDP_HAS_FRAGS |
BPF_F_XDP_DEV_BOUND_ONLY |
- BPF_F_TEST_REG_INVARIANTS))
+ BPF_F_TEST_REG_INVARIANTS |
+ BPF_F_TOKEN_FD))
return -EINVAL;
+ bpf_prog_load_fixup_attach_type(attr);
+
+ if (attr->prog_flags & BPF_F_TOKEN_FD) {
+ token = bpf_token_get_from_fd(attr->prog_token_fd);
+ if (IS_ERR(token))
+ return PTR_ERR(token);
+ /* if current token doesn't grant prog loading permissions,
+ * then we can't use this token, so ignore it and rely on
+ * system-wide capabilities checks
+ */
+ if (!bpf_token_allow_cmd(token, BPF_PROG_LOAD) ||
+ !bpf_token_allow_prog_type(token, attr->prog_type,
+ attr->expected_attach_type)) {
+ bpf_token_put(token);
+ token = NULL;
+ }
+ }
+
+ bpf_cap = bpf_token_capable(token, CAP_BPF);
+ err = -EPERM;
+
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
(attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
- !bpf_capable())
- return -EPERM;
+ !bpf_cap)
+ goto put_token;
/* Intent here is for unprivileged_bpf_disabled to block BPF program
* creation for unprivileged users; other actions depend
@@ -2625,21 +2737,23 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
* capability checks are still carried out for these
* and other operations.
*/
- if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
- return -EPERM;
+ if (sysctl_unprivileged_bpf_disabled && !bpf_cap)
+ goto put_token;
if (attr->insn_cnt == 0 ||
- attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
- return -E2BIG;
+ attr->insn_cnt > (bpf_cap ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) {
+ err = -E2BIG;
+ goto put_token;
+ }
if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
type != BPF_PROG_TYPE_CGROUP_SKB &&
- !bpf_capable())
- return -EPERM;
+ !bpf_cap)
+ goto put_token;
- if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN))
- return -EPERM;
- if (is_perfmon_prog_type(type) && !perfmon_capable())
- return -EPERM;
+ if (is_net_admin_prog_type(type) && !bpf_token_capable(token, CAP_NET_ADMIN))
+ goto put_token;
+ if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON))
+ goto put_token;
/* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog
* or btf, we need to check which one it is
@@ -2649,27 +2763,33 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
if (IS_ERR(dst_prog)) {
dst_prog = NULL;
attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd);
- if (IS_ERR(attach_btf))
- return -EINVAL;
+ if (IS_ERR(attach_btf)) {
+ err = -EINVAL;
+ goto put_token;
+ }
if (!btf_is_kernel(attach_btf)) {
/* attaching through specifying bpf_prog's BTF
* objects directly might be supported eventually
*/
btf_put(attach_btf);
- return -ENOTSUPP;
+ err = -ENOTSUPP;
+ goto put_token;
}
}
} else if (attr->attach_btf_id) {
/* fall back to vmlinux BTF, if BTF type ID is specified */
attach_btf = bpf_get_btf_vmlinux();
- if (IS_ERR(attach_btf))
- return PTR_ERR(attach_btf);
- if (!attach_btf)
- return -EINVAL;
+ if (IS_ERR(attach_btf)) {
+ err = PTR_ERR(attach_btf);
+ goto put_token;
+ }
+ if (!attach_btf) {
+ err = -EINVAL;
+ goto put_token;
+ }
btf_get(attach_btf);
}
- bpf_prog_load_fixup_attach_type(attr);
if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
attach_btf, attr->attach_btf_id,
dst_prog)) {
@@ -2677,7 +2797,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
bpf_prog_put(dst_prog);
if (attach_btf)
btf_put(attach_btf);
- return -EINVAL;
+ err = -EINVAL;
+ goto put_token;
}
/* plain bpf_prog allocation */
@@ -2687,20 +2808,21 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
bpf_prog_put(dst_prog);
if (attach_btf)
btf_put(attach_btf);
- return -ENOMEM;
+ err = -EINVAL;
+ goto put_token;
}
prog->expected_attach_type = attr->expected_attach_type;
+ prog->sleepable = !!(attr->prog_flags & BPF_F_SLEEPABLE);
prog->aux->attach_btf = attach_btf;
prog->aux->attach_btf_id = attr->attach_btf_id;
prog->aux->dst_prog = dst_prog;
prog->aux->dev_bound = !!attr->prog_ifindex;
- prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
- err = security_bpf_prog_alloc(prog->aux);
- if (err)
- goto free_prog;
+ /* move token into prog->aux, reuse taken refcnt */
+ prog->aux->token = token;
+ token = NULL;
prog->aux->user = get_current_user();
prog->len = attr->insn_cnt;
@@ -2709,12 +2831,12 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
if (copy_from_bpfptr(prog->insns,
make_bpfptr(attr->insns, uattr.is_kernel),
bpf_prog_insn_size(prog)) != 0)
- goto free_prog_sec;
+ goto free_prog;
/* copy eBPF program license from user space */
if (strncpy_from_bpfptr(license,
make_bpfptr(attr->license, uattr.is_kernel),
sizeof(license) - 1) < 0)
- goto free_prog_sec;
+ goto free_prog;
license[sizeof(license) - 1] = 0;
/* eBPF programs must be GPL compatible to use GPL-ed functions */
@@ -2728,14 +2850,14 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
if (bpf_prog_is_dev_bound(prog->aux)) {
err = bpf_prog_dev_bound_init(prog, attr);
if (err)
- goto free_prog_sec;
+ goto free_prog;
}
if (type == BPF_PROG_TYPE_EXT && dst_prog &&
bpf_prog_is_dev_bound(dst_prog->aux)) {
err = bpf_prog_dev_bound_inherit(prog, dst_prog);
if (err)
- goto free_prog_sec;
+ goto free_prog;
}
/*
@@ -2757,12 +2879,16 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
/* find program type: socket_filter vs tracing_filter */
err = find_prog_type(type, prog);
if (err < 0)
- goto free_prog_sec;
+ goto free_prog;
prog->aux->load_time = ktime_get_boottime_ns();
err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
sizeof(attr->prog_name));
if (err < 0)
+ goto free_prog;
+
+ err = security_bpf_prog_load(prog, attr, token);
+ if (err)
goto free_prog_sec;
/* run eBPF verifier */
@@ -2808,13 +2934,16 @@ free_used_maps:
*/
__bpf_prog_put_noref(prog, prog->aux->real_func_cnt);
return err;
+
free_prog_sec:
- free_uid(prog->aux->user);
- security_bpf_prog_free(prog->aux);
+ security_bpf_prog_free(prog);
free_prog:
+ free_uid(prog->aux->user);
if (prog->aux->attach_btf)
btf_put(prog->aux->attach_btf);
bpf_prog_free(prog);
+put_token:
+ bpf_token_put(token);
return err;
}
@@ -3501,6 +3630,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
if (!kallsyms_show_value(current_cred()))
addr = 0;
info->perf_event.kprobe.addr = addr;
+ info->perf_event.kprobe.cookie = event->bpf_cookie;
return 0;
}
#endif
@@ -3526,6 +3656,7 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event,
else
info->perf_event.type = BPF_PERF_EVENT_UPROBE;
info->perf_event.uprobe.offset = offset;
+ info->perf_event.uprobe.cookie = event->bpf_cookie;
return 0;
}
#endif
@@ -3553,6 +3684,7 @@ static int bpf_perf_link_fill_tracepoint(const struct perf_event *event,
uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name);
ulen = info->perf_event.tracepoint.name_len;
info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT;
+ info->perf_event.tracepoint.cookie = event->bpf_cookie;
return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL, NULL);
}
@@ -3561,6 +3693,7 @@ static int bpf_perf_link_fill_perf_event(const struct perf_event *event,
{
info->perf_event.event.type = event->attr.type;
info->perf_event.event.config = event->attr.config;
+ info->perf_event.event.cookie = event->bpf_cookie;
info->perf_event.type = BPF_PERF_EVENT_EVENT;
return 0;
}
@@ -3818,7 +3951,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
case BPF_PROG_TYPE_SK_LOOKUP:
return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
case BPF_PROG_TYPE_CGROUP_SKB:
- if (!capable(CAP_NET_ADMIN))
+ if (!bpf_token_capable(prog->aux->token, CAP_NET_ADMIN))
/* cg-skb progs can be loaded by unpriv user.
* check permissions at attach time.
*/
@@ -4021,7 +4154,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
static int bpf_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
- if (!capable(CAP_NET_ADMIN))
+ if (!bpf_net_capable())
return -EPERM;
if (CHECK_ATTR(BPF_PROG_QUERY))
return -EINVAL;
@@ -4320,6 +4453,12 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog,
continue;
}
+ if ((BPF_CLASS(code) == BPF_LDX || BPF_CLASS(code) == BPF_STX ||
+ BPF_CLASS(code) == BPF_ST) && BPF_MODE(code) == BPF_PROBE_MEM32) {
+ insns[i].code = BPF_CLASS(code) | BPF_SIZE(code) | BPF_MEM;
+ continue;
+ }
+
if (code != (BPF_LD | BPF_IMM | BPF_DW))
continue;
@@ -4687,6 +4826,8 @@ static int bpf_map_get_info_by_fd(struct file *file,
info.btf_value_type_id = map->btf_value_type_id;
}
info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
+ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS)
+ bpf_map_struct_ops_info_fill(&info, map);
if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_info_fill(&info, map);
@@ -4789,15 +4930,34 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
return err;
}
-#define BPF_BTF_LOAD_LAST_FIELD btf_log_true_size
+#define BPF_BTF_LOAD_LAST_FIELD btf_token_fd
static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
{
+ struct bpf_token *token = NULL;
+
if (CHECK_ATTR(BPF_BTF_LOAD))
return -EINVAL;
- if (!bpf_capable())
+ if (attr->btf_flags & ~BPF_F_TOKEN_FD)
+ return -EINVAL;
+
+ if (attr->btf_flags & BPF_F_TOKEN_FD) {
+ token = bpf_token_get_from_fd(attr->btf_token_fd);
+ if (IS_ERR(token))
+ return PTR_ERR(token);
+ if (!bpf_token_allow_cmd(token, BPF_BTF_LOAD)) {
+ bpf_token_put(token);
+ token = NULL;
+ }
+ }
+
+ if (!bpf_token_capable(token, CAP_BPF)) {
+ bpf_token_put(token);
return -EPERM;
+ }
+
+ bpf_token_put(token);
return btf_new_fd(attr, uattr, uattr_size);
}
@@ -5394,7 +5554,7 @@ static int bpf_prog_bind_map(union bpf_attr *attr)
/* The bpf program will not access the bpf map, but for the sake of
* simplicity, increase sleepable_refcnt for sleepable program as well.
*/
- if (prog->aux->sleepable)
+ if (prog->sleepable)
atomic64_inc(&map->sleepable_refcnt);
memcpy(used_maps_new, used_maps_old,
sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
@@ -5415,6 +5575,20 @@ out_prog_put:
return ret;
}
+#define BPF_TOKEN_CREATE_LAST_FIELD token_create.bpffs_fd
+
+static int token_create(union bpf_attr *attr)
+{
+ if (CHECK_ATTR(BPF_TOKEN_CREATE))
+ return -EINVAL;
+
+ /* no flags are supported yet */
+ if (attr->token_create.flags)
+ return -EINVAL;
+
+ return bpf_token_create(attr);
+}
+
static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
{
union bpf_attr attr;
@@ -5548,6 +5722,9 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
case BPF_PROG_BIND_MAP:
err = bpf_prog_bind_map(&attr);
break;
+ case BPF_TOKEN_CREATE:
+ err = token_create(&attr);
+ break;
default:
err = -EINVAL;
break;
@@ -5654,7 +5831,7 @@ static const struct bpf_func_proto bpf_sys_bpf_proto = {
const struct bpf_func_proto * __weak
tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
- return bpf_base_func_proto(func_id);
+ return bpf_base_func_proto(func_id, prog);
}
BPF_CALL_1(bpf_sys_close, u32, fd)
@@ -5704,7 +5881,8 @@ syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_sys_bpf:
- return !perfmon_capable() ? NULL : &bpf_sys_bpf_proto;
+ return !bpf_token_capable(prog->aux->token, CAP_PERFMON)
+ ? NULL : &bpf_sys_bpf_proto;
case BPF_FUNC_btf_find_by_name_kind:
return &bpf_btf_find_by_name_kind_proto;
case BPF_FUNC_sys_close:
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
new file mode 100644
index 000000000000..d6ccf8d00eab
--- /dev/null
+++ b/kernel/bpf/token.c
@@ -0,0 +1,278 @@
+#include <linux/bpf.h>
+#include <linux/vmalloc.h>
+#include <linux/fdtable.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/idr.h>
+#include <linux/namei.h>
+#include <linux/user_namespace.h>
+#include <linux/security.h>
+
+static bool bpf_ns_capable(struct user_namespace *ns, int cap)
+{
+ return ns_capable(ns, cap) || (cap != CAP_SYS_ADMIN && ns_capable(ns, CAP_SYS_ADMIN));
+}
+
+bool bpf_token_capable(const struct bpf_token *token, int cap)
+{
+ struct user_namespace *userns;
+
+ /* BPF token allows ns_capable() level of capabilities */
+ userns = token ? token->userns : &init_user_ns;
+ if (!bpf_ns_capable(userns, cap))
+ return false;
+ if (token && security_bpf_token_capable(token, cap) < 0)
+ return false;
+ return true;
+}
+
+void bpf_token_inc(struct bpf_token *token)
+{
+ atomic64_inc(&token->refcnt);
+}
+
+static void bpf_token_free(struct bpf_token *token)
+{
+ security_bpf_token_free(token);
+ put_user_ns(token->userns);
+ kfree(token);
+}
+
+static void bpf_token_put_deferred(struct work_struct *work)
+{
+ struct bpf_token *token = container_of(work, struct bpf_token, work);
+
+ bpf_token_free(token);
+}
+
+void bpf_token_put(struct bpf_token *token)
+{
+ if (!token)
+ return;
+
+ if (!atomic64_dec_and_test(&token->refcnt))
+ return;
+
+ INIT_WORK(&token->work, bpf_token_put_deferred);
+ schedule_work(&token->work);
+}
+
+static int bpf_token_release(struct inode *inode, struct file *filp)
+{
+ struct bpf_token *token = filp->private_data;
+
+ bpf_token_put(token);
+ return 0;
+}
+
+static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp)
+{
+ struct bpf_token *token = filp->private_data;
+ u64 mask;
+
+ BUILD_BUG_ON(__MAX_BPF_CMD >= 64);
+ mask = BIT_ULL(__MAX_BPF_CMD) - 1;
+ if ((token->allowed_cmds & mask) == mask)
+ seq_printf(m, "allowed_cmds:\tany\n");
+ else
+ seq_printf(m, "allowed_cmds:\t0x%llx\n", token->allowed_cmds);
+
+ BUILD_BUG_ON(__MAX_BPF_MAP_TYPE >= 64);
+ mask = BIT_ULL(__MAX_BPF_MAP_TYPE) - 1;
+ if ((token->allowed_maps & mask) == mask)
+ seq_printf(m, "allowed_maps:\tany\n");
+ else
+ seq_printf(m, "allowed_maps:\t0x%llx\n", token->allowed_maps);
+
+ BUILD_BUG_ON(__MAX_BPF_PROG_TYPE >= 64);
+ mask = BIT_ULL(__MAX_BPF_PROG_TYPE) - 1;
+ if ((token->allowed_progs & mask) == mask)
+ seq_printf(m, "allowed_progs:\tany\n");
+ else
+ seq_printf(m, "allowed_progs:\t0x%llx\n", token->allowed_progs);
+
+ BUILD_BUG_ON(__MAX_BPF_ATTACH_TYPE >= 64);
+ mask = BIT_ULL(__MAX_BPF_ATTACH_TYPE) - 1;
+ if ((token->allowed_attachs & mask) == mask)
+ seq_printf(m, "allowed_attachs:\tany\n");
+ else
+ seq_printf(m, "allowed_attachs:\t0x%llx\n", token->allowed_attachs);
+}
+
+#define BPF_TOKEN_INODE_NAME "bpf-token"
+
+static const struct inode_operations bpf_token_iops = { };
+
+static const struct file_operations bpf_token_fops = {
+ .release = bpf_token_release,
+ .show_fdinfo = bpf_token_show_fdinfo,
+};
+
+int bpf_token_create(union bpf_attr *attr)
+{
+ struct bpf_mount_opts *mnt_opts;
+ struct bpf_token *token = NULL;
+ struct user_namespace *userns;
+ struct inode *inode;
+ struct file *file;
+ struct path path;
+ struct fd f;
+ umode_t mode;
+ int err, fd;
+
+ f = fdget(attr->token_create.bpffs_fd);
+ if (!f.file)
+ return -EBADF;
+
+ path = f.file->f_path;
+ path_get(&path);
+ fdput(f);
+
+ if (path.dentry != path.mnt->mnt_sb->s_root) {
+ err = -EINVAL;
+ goto out_path;
+ }
+ if (path.mnt->mnt_sb->s_op != &bpf_super_ops) {
+ err = -EINVAL;
+ goto out_path;
+ }
+ err = path_permission(&path, MAY_ACCESS);
+ if (err)
+ goto out_path;
+
+ userns = path.dentry->d_sb->s_user_ns;
+ /*
+ * Enforce that creators of BPF tokens are in the same user
+ * namespace as the BPF FS instance. This makes reasoning about
+ * permissions a lot easier and we can always relax this later.
+ */
+ if (current_user_ns() != userns) {
+ err = -EPERM;
+ goto out_path;
+ }
+ if (!ns_capable(userns, CAP_BPF)) {
+ err = -EPERM;
+ goto out_path;
+ }
+
+ /* Creating BPF token in init_user_ns doesn't make much sense. */
+ if (current_user_ns() == &init_user_ns) {
+ err = -EOPNOTSUPP;
+ goto out_path;
+ }
+
+ mnt_opts = path.dentry->d_sb->s_fs_info;
+ if (mnt_opts->delegate_cmds == 0 &&
+ mnt_opts->delegate_maps == 0 &&
+ mnt_opts->delegate_progs == 0 &&
+ mnt_opts->delegate_attachs == 0) {
+ err = -ENOENT; /* no BPF token delegation is set up */
+ goto out_path;
+ }
+
+ mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
+ inode = bpf_get_inode(path.mnt->mnt_sb, NULL, mode);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto out_path;
+ }
+
+ inode->i_op = &bpf_token_iops;
+ inode->i_fop = &bpf_token_fops;
+ clear_nlink(inode); /* make sure it is unlinked */
+
+ file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops);
+ if (IS_ERR(file)) {
+ iput(inode);
+ err = PTR_ERR(file);
+ goto out_path;
+ }
+
+ token = kzalloc(sizeof(*token), GFP_USER);
+ if (!token) {
+ err = -ENOMEM;
+ goto out_file;
+ }
+
+ atomic64_set(&token->refcnt, 1);
+
+ /* remember bpffs owning userns for future ns_capable() checks */
+ token->userns = get_user_ns(userns);
+
+ token->allowed_cmds = mnt_opts->delegate_cmds;
+ token->allowed_maps = mnt_opts->delegate_maps;
+ token->allowed_progs = mnt_opts->delegate_progs;
+ token->allowed_attachs = mnt_opts->delegate_attachs;
+
+ err = security_bpf_token_create(token, attr, &path);
+ if (err)
+ goto out_token;
+
+ fd = get_unused_fd_flags(O_CLOEXEC);
+ if (fd < 0) {
+ err = fd;
+ goto out_token;
+ }
+
+ file->private_data = token;
+ fd_install(fd, file);
+
+ path_put(&path);
+ return fd;
+
+out_token:
+ bpf_token_free(token);
+out_file:
+ fput(file);
+out_path:
+ path_put(&path);
+ return err;
+}
+
+struct bpf_token *bpf_token_get_from_fd(u32 ufd)
+{
+ struct fd f = fdget(ufd);
+ struct bpf_token *token;
+
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+ if (f.file->f_op != &bpf_token_fops) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ token = f.file->private_data;
+ bpf_token_inc(token);
+ fdput(f);
+
+ return token;
+}
+
+bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
+{
+ if (!token)
+ return false;
+ if (!(token->allowed_cmds & BIT_ULL(cmd)))
+ return false;
+ return security_bpf_token_cmd(token, cmd) == 0;
+}
+
+bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type)
+{
+ if (!token || type >= __MAX_BPF_MAP_TYPE)
+ return false;
+
+ return token->allowed_maps & BIT_ULL(type);
+}
+
+bool bpf_token_allow_prog_type(const struct bpf_token *token,
+ enum bpf_prog_type prog_type,
+ enum bpf_attach_type attach_type)
+{
+ if (!token || prog_type >= __MAX_BPF_PROG_TYPE || attach_type >= __MAX_BPF_ATTACH_TYPE)
+ return false;
+
+ return (token->allowed_progs & BIT_ULL(prog_type)) &&
+ (token->allowed_attachs & BIT_ULL(attach_type));
+}
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index d382f5ebe06c..db7599c59c78 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -1014,7 +1014,7 @@ void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr)
bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog)
{
- bool sleepable = prog->aux->sleepable;
+ bool sleepable = prog->sleepable;
if (bpf_prog_check_recur(prog))
return sleepable ? __bpf_prog_enter_sleepable_recur :
@@ -1029,7 +1029,7 @@ bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog)
bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog)
{
- bool sleepable = prog->aux->sleepable;
+ bool sleepable = prog->sleepable;
if (bpf_prog_check_recur(prog))
return sleepable ? __bpf_prog_exit_sleepable_recur :
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ddea9567f755..63749ad5ac6b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -528,6 +528,21 @@ static bool is_sync_callback_calling_insn(struct bpf_insn *insn)
(bpf_pseudo_kfunc_call(insn) && is_sync_callback_calling_kfunc(insn->imm));
}
+static bool is_async_callback_calling_insn(struct bpf_insn *insn)
+{
+ return bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm);
+}
+
+static bool is_may_goto_insn(struct bpf_insn *insn)
+{
+ return insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO;
+}
+
+static bool is_may_goto_insn_at(struct bpf_verifier_env *env, int insn_idx)
+{
+ return is_may_goto_insn(&env->prog->insnsi[insn_idx]);
+}
+
static bool is_storage_get_function(enum bpf_func_id func_id)
{
return func_id == BPF_FUNC_sk_storage_get ||
@@ -1155,6 +1170,12 @@ static bool is_spilled_scalar_reg(const struct bpf_stack_state *stack)
stack->spilled_ptr.type == SCALAR_VALUE;
}
+static bool is_spilled_scalar_reg64(const struct bpf_stack_state *stack)
+{
+ return stack->slot_type[0] == STACK_SPILL &&
+ stack->spilled_ptr.type == SCALAR_VALUE;
+}
+
/* Mark stack slot as STACK_MISC, unless it is already STACK_INVALID, in which
* case they are equivalent, or it's STACK_ZERO, in which case we preserve
* more precise STACK_ZERO.
@@ -1418,6 +1439,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
dst_state->dfs_depth = src->dfs_depth;
dst_state->callback_unroll_depth = src->callback_unroll_depth;
dst_state->used_as_loop_entry = src->used_as_loop_entry;
+ dst_state->may_goto_depth = src->may_goto_depth;
for (i = 0; i <= src->curframe; i++) {
dst = dst_state->frame[i];
if (!dst) {
@@ -2264,8 +2286,7 @@ static void __reg_assign_32_into_64(struct bpf_reg_state *reg)
}
/* Mark a register as having a completely unknown (scalar) value. */
-static void __mark_reg_unknown(const struct bpf_verifier_env *env,
- struct bpf_reg_state *reg)
+static void __mark_reg_unknown_imprecise(struct bpf_reg_state *reg)
{
/*
* Clear type, off, and union(map_ptr, range) and
@@ -2277,10 +2298,20 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env,
reg->ref_obj_id = 0;
reg->var_off = tnum_unknown;
reg->frameno = 0;
- reg->precise = !env->bpf_capable;
+ reg->precise = false;
__mark_reg_unbounded(reg);
}
+/* Mark a register as having a completely unknown (scalar) value,
+ * initialize .precise as true when not bpf capable.
+ */
+static void __mark_reg_unknown(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg)
+{
+ __mark_reg_unknown_imprecise(reg);
+ reg->precise = !env->bpf_capable;
+}
+
static void mark_reg_unknown(struct bpf_verifier_env *env,
struct bpf_reg_state *regs, u32 regno)
{
@@ -4355,6 +4386,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_MEM:
case PTR_TO_FUNC:
case PTR_TO_MAP_KEY:
+ case PTR_TO_ARENA:
return true;
default:
return false;
@@ -4380,20 +4412,6 @@ static u64 reg_const_value(struct bpf_reg_state *reg, bool subreg32)
return subreg32 ? tnum_subreg(reg->var_off).value : reg->var_off.value;
}
-static bool __is_scalar_unbounded(struct bpf_reg_state *reg)
-{
- return tnum_is_unknown(reg->var_off) &&
- reg->smin_value == S64_MIN && reg->smax_value == S64_MAX &&
- reg->umin_value == 0 && reg->umax_value == U64_MAX &&
- reg->s32_min_value == S32_MIN && reg->s32_max_value == S32_MAX &&
- reg->u32_min_value == 0 && reg->u32_max_value == U32_MAX;
-}
-
-static bool register_is_bounded(struct bpf_reg_state *reg)
-{
- return reg->type == SCALAR_VALUE && !__is_scalar_unbounded(reg);
-}
-
static bool __is_pointer_value(bool allow_ptr_leaks,
const struct bpf_reg_state *reg)
{
@@ -4403,6 +4421,18 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
return reg->type != SCALAR_VALUE;
}
+static void assign_scalar_id_before_mov(struct bpf_verifier_env *env,
+ struct bpf_reg_state *src_reg)
+{
+ if (src_reg->type == SCALAR_VALUE && !src_reg->id &&
+ !tnum_is_const(src_reg->var_off))
+ /* Ensure that src_reg has a valid ID that will be copied to
+ * dst_reg and then will be used by find_equal_scalars() to
+ * propagate min/max range.
+ */
+ src_reg->id = ++env->id_gen;
+}
+
/* Copy src state preserving dst->parent and dst->live fields */
static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src)
{
@@ -4438,6 +4468,11 @@ static bool is_bpf_st_mem(struct bpf_insn *insn)
return BPF_CLASS(insn->code) == BPF_ST && BPF_MODE(insn->code) == BPF_MEM;
}
+static int get_reg_width(struct bpf_reg_state *reg)
+{
+ return fls64(reg->umax_value);
+}
+
/* check_stack_{read,write}_fixed_off functions track spill/fill of registers,
* stack boundary and alignment are checked in check_mem_access()
*/
@@ -4487,13 +4522,19 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
return err;
mark_stack_slot_scratched(env, spi);
- if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) && env->bpf_capable) {
+ if (reg && !(off % BPF_REG_SIZE) && reg->type == SCALAR_VALUE && env->bpf_capable) {
+ bool reg_value_fits;
+
+ reg_value_fits = get_reg_width(reg) <= BITS_PER_BYTE * size;
+ /* Make sure that reg had an ID to build a relation on spill. */
+ if (reg_value_fits)
+ assign_scalar_id_before_mov(env, reg);
save_register_state(env, state, spi, reg, size);
/* Break the relation on a narrowing spill. */
- if (fls64(reg->umax_value) > BITS_PER_BYTE * size)
+ if (!reg_value_fits)
state->stack[spi].spilled_ptr.id = 0;
} else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) &&
- insn->imm != 0 && env->bpf_capable) {
+ env->bpf_capable) {
struct bpf_reg_state fake_reg = {};
__mark_reg_known(&fake_reg, insn->imm);
@@ -4640,7 +4681,20 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
return -EINVAL;
}
- /* Erase all spilled pointers. */
+ /* If writing_zero and the spi slot contains a spill of value 0,
+ * maintain the spill type.
+ */
+ if (writing_zero && *stype == STACK_SPILL &&
+ is_spilled_scalar_reg(&state->stack[spi])) {
+ struct bpf_reg_state *spill_reg = &state->stack[spi].spilled_ptr;
+
+ if (tnum_is_const(spill_reg->var_off) && spill_reg->var_off.value == 0) {
+ zero_used = true;
+ continue;
+ }
+ }
+
+ /* Erase all other spilled pointers. */
state->stack[spi].spilled_ptr.type = NOT_INIT;
/* Update the slot type. */
@@ -4756,7 +4810,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
if (dst_regno < 0)
return 0;
- if (!(off % BPF_REG_SIZE) && size == spill_size) {
+ if (size <= spill_size &&
+ bpf_stack_narrow_access_ok(off, size, spill_size)) {
/* The earlier check_reg_arg() has decided the
* subreg_def for this insn. Save it first.
*/
@@ -4764,6 +4819,12 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
copy_register_state(&state->regs[dst_regno], reg);
state->regs[dst_regno].subreg_def = subreg_def;
+
+ /* Break the relation on a narrowing fill.
+ * coerce_reg_to_size will adjust the boundaries.
+ */
+ if (get_reg_width(reg) > size * BITS_PER_BYTE)
+ state->regs[dst_regno].id = 0;
} else {
int spill_cnt = 0, zero_cnt = 0;
@@ -5211,6 +5272,11 @@ bad_type:
return -EINVAL;
}
+static bool in_sleepable(struct bpf_verifier_env *env)
+{
+ return env->prog->sleepable;
+}
+
/* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock()
* can dereference RCU protected pointers and result is PTR_TRUSTED.
*/
@@ -5218,7 +5284,7 @@ static bool in_rcu_cs(struct bpf_verifier_env *env)
{
return env->cur_state->active_rcu_lock ||
env->cur_state->active_lock.ptr ||
- !env->prog->aux->sleepable;
+ !in_sleepable(env);
}
/* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */
@@ -5763,6 +5829,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
case PTR_TO_XDP_SOCK:
pointer_desc = "xdp_sock ";
break;
+ case PTR_TO_ARENA:
+ return 0;
default:
break;
}
@@ -5770,6 +5838,17 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
strict);
}
+static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
+{
+ if (env->prog->jit_requested)
+ return round_up(stack_depth, 16);
+
+ /* round up to 32-bytes, since this is granularity
+ * of interpreter stack size
+ */
+ return round_up(max_t(u32, stack_depth, 1), 32);
+}
+
/* starting from main bpf function walk all instructions of the function
* and recursively walk all callees that given function can call.
* Ignore jump and exit insns.
@@ -5813,10 +5892,7 @@ process_func:
depth);
return -EACCES;
}
- /* round up to 32-bytes, since this is granularity
- * of interpreter stack size
- */
- depth += round_up(max_t(u32, subprog[idx].stack_depth, 1), 32);
+ depth += round_up_stack_depth(env, subprog[idx].stack_depth);
if (depth > MAX_BPF_STACK) {
verbose(env, "combined stack size of %d calls is %d. Too large\n",
frame + 1, depth);
@@ -5910,7 +5986,7 @@ continue_func:
*/
if (frame == 0)
return 0;
- depth -= round_up(max_t(u32, subprog[idx].stack_depth, 1), 32);
+ depth -= round_up_stack_depth(env, subprog[idx].stack_depth);
frame--;
i = ret_insn[frame];
idx = ret_prog[frame];
@@ -6041,10 +6117,10 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
* values are also truncated so we push 64-bit bounds into
* 32-bit bounds. Above were truncated < 32-bits already.
*/
- if (size < 4) {
+ if (size < 4)
__mark_reg32_unbounded(reg);
- reg_bounds_sync(reg);
- }
+
+ reg_bounds_sync(reg);
}
static void set_sext64_default_val(struct bpf_reg_state *reg, int size)
@@ -6864,6 +6940,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ))
mark_reg_unknown(env, regs, value_regno);
+ } else if (reg->type == PTR_TO_ARENA) {
+ if (t == BPF_READ && value_regno >= 0)
+ mark_reg_unknown(env, regs, value_regno);
} else {
verbose(env, "R%d invalid mem access '%s'\n", regno,
reg_type_str(env, reg->type));
@@ -8200,6 +8279,7 @@ found:
switch ((int)reg->type) {
case PTR_TO_BTF_ID:
case PTR_TO_BTF_ID | PTR_TRUSTED:
+ case PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL:
case PTR_TO_BTF_ID | MEM_RCU:
case PTR_TO_BTF_ID | PTR_MAYBE_NULL:
case PTR_TO_BTF_ID | PTR_MAYBE_NULL | MEM_RCU:
@@ -8334,6 +8414,7 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env,
case PTR_TO_MEM | MEM_RINGBUF:
case PTR_TO_BUF:
case PTR_TO_BUF | MEM_RDONLY:
+ case PTR_TO_ARENA:
case SCALAR_VALUE:
return 0;
/* All the rest must be rejected, except PTR_TO_BTF_ID which allows
@@ -9298,10 +9379,34 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
bpf_log(log, "arg#%d is expected to be non-NULL\n", i);
return -EINVAL;
}
+ } else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) {
+ /*
+ * Can pass any value and the kernel won't crash, but
+ * only PTR_TO_ARENA or SCALAR make sense. Everything
+ * else is a bug in the bpf program. Point it out to
+ * the user at the verification time instead of
+ * run-time debug nightmare.
+ */
+ if (reg->type != PTR_TO_ARENA && reg->type != SCALAR_VALUE) {
+ bpf_log(log, "R%d is not a pointer to arena or scalar.\n", regno);
+ return -EINVAL;
+ }
} else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) {
ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0);
if (ret)
return ret;
+ } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) {
+ struct bpf_call_arg_meta meta;
+ int err;
+
+ if (register_is_null(reg) && type_may_be_null(arg->arg_type))
+ continue;
+
+ memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */
+ err = check_reg_type(env, regno, arg->arg_type, &arg->btf_id, &meta);
+ err = err ?: check_func_arg_reg_off(env, reg, regno, arg->arg_type);
+ if (err)
+ return err;
} else {
bpf_log(log, "verifier bug: unrecognized arg#%d type %d\n",
i, arg->arg_type);
@@ -9377,9 +9482,7 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins
return -EFAULT;
}
- if (insn->code == (BPF_JMP | BPF_CALL) &&
- insn->src_reg == 0 &&
- insn->imm == BPF_FUNC_timer_set_callback) {
+ if (is_async_callback_calling_insn(insn)) {
struct bpf_verifier_state *async_cb;
/* there is no real recursion here. timer callbacks are async */
@@ -9438,6 +9541,13 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (subprog_is_global(env, subprog)) {
const char *sub_name = subprog_name(env, subprog);
+ /* Only global subprogs cannot be called with a lock held. */
+ if (env->cur_state->active_lock.ptr) {
+ verbose(env, "global function calls are not allowed while holding a lock,\n"
+ "use static function instead\n");
+ return -EINVAL;
+ }
+
if (err) {
verbose(env, "Caller passes invalid args into func#%d ('%s')\n",
subprog, sub_name);
@@ -10094,7 +10204,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -EINVAL;
}
- if (!env->prog->aux->sleepable && fn->might_sleep) {
+ if (!in_sleepable(env) && fn->might_sleep) {
verbose(env, "helper call might sleep in a non-sleepable prog\n");
return -EINVAL;
}
@@ -10124,7 +10234,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -EINVAL;
}
- if (env->prog->aux->sleepable && is_storage_get_function(func_id))
+ if (in_sleepable(env) && is_storage_get_function(func_id))
env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
}
@@ -10620,24 +10730,6 @@ static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta)
return meta->kfunc_flags & KF_RCU_PROTECTED;
}
-static bool __kfunc_param_match_suffix(const struct btf *btf,
- const struct btf_param *arg,
- const char *suffix)
-{
- int suffix_len = strlen(suffix), len;
- const char *param_name;
-
- /* In the future, this can be ported to use BTF tagging */
- param_name = btf_name_by_offset(btf, arg->name_off);
- if (str_is_empty(param_name))
- return false;
- len = strlen(param_name);
- if (len < suffix_len)
- return false;
- param_name += len - suffix_len;
- return !strncmp(param_name, suffix, suffix_len);
-}
-
static bool is_kfunc_arg_mem_size(const struct btf *btf,
const struct btf_param *arg,
const struct bpf_reg_state *reg)
@@ -10648,7 +10740,7 @@ static bool is_kfunc_arg_mem_size(const struct btf *btf,
if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
return false;
- return __kfunc_param_match_suffix(btf, arg, "__sz");
+ return btf_param_match_suffix(btf, arg, "__sz");
}
static bool is_kfunc_arg_const_mem_size(const struct btf *btf,
@@ -10661,47 +10753,52 @@ static bool is_kfunc_arg_const_mem_size(const struct btf *btf,
if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
return false;
- return __kfunc_param_match_suffix(btf, arg, "__szk");
+ return btf_param_match_suffix(btf, arg, "__szk");
}
static bool is_kfunc_arg_optional(const struct btf *btf, const struct btf_param *arg)
{
- return __kfunc_param_match_suffix(btf, arg, "__opt");
+ return btf_param_match_suffix(btf, arg, "__opt");
}
static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg)
{
- return __kfunc_param_match_suffix(btf, arg, "__k");
+ return btf_param_match_suffix(btf, arg, "__k");
}
static bool is_kfunc_arg_ignore(const struct btf *btf, const struct btf_param *arg)
{
- return __kfunc_param_match_suffix(btf, arg, "__ign");
+ return btf_param_match_suffix(btf, arg, "__ign");
+}
+
+static bool is_kfunc_arg_map(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__map");
}
static bool is_kfunc_arg_alloc_obj(const struct btf *btf, const struct btf_param *arg)
{
- return __kfunc_param_match_suffix(btf, arg, "__alloc");
+ return btf_param_match_suffix(btf, arg, "__alloc");
}
static bool is_kfunc_arg_uninit(const struct btf *btf, const struct btf_param *arg)
{
- return __kfunc_param_match_suffix(btf, arg, "__uninit");
+ return btf_param_match_suffix(btf, arg, "__uninit");
}
static bool is_kfunc_arg_refcounted_kptr(const struct btf *btf, const struct btf_param *arg)
{
- return __kfunc_param_match_suffix(btf, arg, "__refcounted_kptr");
+ return btf_param_match_suffix(btf, arg, "__refcounted_kptr");
}
static bool is_kfunc_arg_nullable(const struct btf *btf, const struct btf_param *arg)
{
- return __kfunc_param_match_suffix(btf, arg, "__nullable");
+ return btf_param_match_suffix(btf, arg, "__nullable");
}
static bool is_kfunc_arg_const_str(const struct btf *btf, const struct btf_param *arg)
{
- return __kfunc_param_match_suffix(btf, arg, "__str");
+ return btf_param_match_suffix(btf, arg, "__str");
}
static bool is_kfunc_arg_scalar_with_name(const struct btf *btf,
@@ -10848,6 +10945,7 @@ enum kfunc_ptr_arg_type {
KF_ARG_PTR_TO_RB_NODE,
KF_ARG_PTR_TO_NULL,
KF_ARG_PTR_TO_CONST_STR,
+ KF_ARG_PTR_TO_MAP,
};
enum special_kfunc_type {
@@ -10971,7 +11069,7 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
* type to our caller. When a set of conditions hold in the BTF type of
* arguments, we resolve it to a known kfunc_ptr_arg_type.
*/
- if (btf_get_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno))
+ if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno))
return KF_ARG_PTR_TO_CTX;
if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno]))
@@ -11001,6 +11099,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_const_str(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_CONST_STR;
+ if (is_kfunc_arg_map(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_MAP;
+
if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) {
if (!btf_type_is_struct(ref_t)) {
verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n",
@@ -11483,7 +11584,7 @@ static bool check_css_task_iter_allowlist(struct bpf_verifier_env *env)
return true;
fallthrough;
default:
- return env->prog->aux->sleepable;
+ return in_sleepable(env);
}
}
@@ -11601,6 +11702,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
switch (kf_arg_type) {
case KF_ARG_PTR_TO_NULL:
continue;
+ case KF_ARG_PTR_TO_MAP:
case KF_ARG_PTR_TO_ALLOC_BTF_ID:
case KF_ARG_PTR_TO_BTF_ID:
if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta))
@@ -11817,6 +11919,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
if (ret < 0)
return ret;
break;
+ case KF_ARG_PTR_TO_MAP:
+ /* If argument has '__map' suffix expect 'struct bpf_map *' */
+ ref_id = *reg2btf_ids[CONST_PTR_TO_MAP];
+ ref_t = btf_type_by_id(btf_vmlinux, ref_id);
+ ref_tname = btf_name_by_offset(btf, ref_t->name_off);
+ fallthrough;
case KF_ARG_PTR_TO_BTF_ID:
/* Only base_type is checked, further checks are done here */
if ((base_type(reg->type) != PTR_TO_BTF_ID ||
@@ -12004,7 +12112,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
sleepable = is_kfunc_sleepable(&meta);
- if (sleepable && !env->prog->aux->sleepable) {
+ if (sleepable && !in_sleepable(env)) {
verbose(env, "program must be sleepable to call sleepable kfunc %s\n", func_name);
return -EACCES;
}
@@ -12291,6 +12399,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
meta.func_name);
return -EFAULT;
}
+ } else if (btf_type_is_void(ptr_type)) {
+ /* kfunc returning 'void *' is equivalent to returning scalar */
+ mark_reg_unknown(env, regs, BPF_REG_0);
} else if (!__btf_type_is_struct(ptr_type)) {
if (!meta.r0_size) {
__u32 sz;
@@ -12828,6 +12939,19 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
}
switch (base_type(ptr_reg->type)) {
+ case PTR_TO_CTX:
+ case PTR_TO_MAP_VALUE:
+ case PTR_TO_MAP_KEY:
+ case PTR_TO_STACK:
+ case PTR_TO_PACKET_META:
+ case PTR_TO_PACKET:
+ case PTR_TO_TP_BUFFER:
+ case PTR_TO_BTF_ID:
+ case PTR_TO_MEM:
+ case PTR_TO_BUF:
+ case PTR_TO_FUNC:
+ case CONST_PTR_TO_DYNPTR:
+ break;
case PTR_TO_FLOW_KEYS:
if (known)
break;
@@ -12837,16 +12961,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
if (known && smin_val == 0 && opcode == BPF_ADD)
break;
fallthrough;
- case PTR_TO_PACKET_END:
- case PTR_TO_SOCKET:
- case PTR_TO_SOCK_COMMON:
- case PTR_TO_TCP_SOCK:
- case PTR_TO_XDP_SOCK:
+ default:
verbose(env, "R%d pointer arithmetic on %s prohibited\n",
dst, reg_type_str(env, ptr_reg->type));
return -EACCES;
- default:
- break;
}
/* In case of 'scalar += pointer', dst_reg inherits pointer type and id.
@@ -13753,6 +13871,21 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
dst_reg = &regs[insn->dst_reg];
src_reg = NULL;
+
+ if (dst_reg->type == PTR_TO_ARENA) {
+ struct bpf_insn_aux_data *aux = cur_aux(env);
+
+ if (BPF_CLASS(insn->code) == BPF_ALU64)
+ /*
+ * 32-bit operations zero upper bits automatically.
+ * 64-bit operations need to be converted to 32.
+ */
+ aux->needs_zext = true;
+
+ /* Any arithmetic operations are allowed on arena pointers */
+ return 0;
+ }
+
if (dst_reg->type != SCALAR_VALUE)
ptr_reg = dst_reg;
else
@@ -13870,19 +14003,20 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
} else if (opcode == BPF_MOV) {
if (BPF_SRC(insn->code) == BPF_X) {
- if (insn->imm != 0) {
- verbose(env, "BPF_MOV uses reserved fields\n");
- return -EINVAL;
- }
-
if (BPF_CLASS(insn->code) == BPF_ALU) {
- if (insn->off != 0 && insn->off != 8 && insn->off != 16) {
+ if ((insn->off != 0 && insn->off != 8 && insn->off != 16) ||
+ insn->imm) {
verbose(env, "BPF_MOV uses reserved fields\n");
return -EINVAL;
}
+ } else if (insn->off == BPF_ADDR_SPACE_CAST) {
+ if (insn->imm != 1 && insn->imm != 1u << 16) {
+ verbose(env, "addr_space_cast insn can only convert between address space 1 and 0\n");
+ return -EINVAL;
+ }
} else {
- if (insn->off != 0 && insn->off != 8 && insn->off != 16 &&
- insn->off != 32) {
+ if ((insn->off != 0 && insn->off != 8 && insn->off != 16 &&
+ insn->off != 32) || insn->imm) {
verbose(env, "BPF_MOV uses reserved fields\n");
return -EINVAL;
}
@@ -13907,20 +14041,18 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (BPF_SRC(insn->code) == BPF_X) {
struct bpf_reg_state *src_reg = regs + insn->src_reg;
struct bpf_reg_state *dst_reg = regs + insn->dst_reg;
- bool need_id = src_reg->type == SCALAR_VALUE && !src_reg->id &&
- !tnum_is_const(src_reg->var_off);
if (BPF_CLASS(insn->code) == BPF_ALU64) {
- if (insn->off == 0) {
+ if (insn->imm) {
+ /* off == BPF_ADDR_SPACE_CAST */
+ mark_reg_unknown(env, regs, insn->dst_reg);
+ if (insn->imm == 1) /* cast from as(1) to as(0) */
+ dst_reg->type = PTR_TO_ARENA;
+ } else if (insn->off == 0) {
/* case: R1 = R2
* copy register state to dest reg
*/
- if (need_id)
- /* Assign src and dst registers the same ID
- * that will be used by find_equal_scalars()
- * to propagate min/max range.
- */
- src_reg->id = ++env->id_gen;
+ assign_scalar_id_before_mov(env, src_reg);
copy_register_state(dst_reg, src_reg);
dst_reg->live |= REG_LIVE_WRITTEN;
dst_reg->subreg_def = DEF_NOT_SUBREG;
@@ -13935,8 +14067,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
bool no_sext;
no_sext = src_reg->umax_value < (1ULL << (insn->off - 1));
- if (no_sext && need_id)
- src_reg->id = ++env->id_gen;
+ if (no_sext)
+ assign_scalar_id_before_mov(env, src_reg);
copy_register_state(dst_reg, src_reg);
if (!no_sext)
dst_reg->id = 0;
@@ -13956,10 +14088,10 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EACCES;
} else if (src_reg->type == SCALAR_VALUE) {
if (insn->off == 0) {
- bool is_src_reg_u32 = src_reg->umax_value <= U32_MAX;
+ bool is_src_reg_u32 = get_reg_width(src_reg) <= 32;
- if (is_src_reg_u32 && need_id)
- src_reg->id = ++env->id_gen;
+ if (is_src_reg_u32)
+ assign_scalar_id_before_mov(env, src_reg);
copy_register_state(dst_reg, src_reg);
/* Make sure ID is cleared if src_reg is not in u32
* range otherwise dst_reg min/max could be incorrectly
@@ -13973,8 +14105,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
/* case: W1 = (s8, s16)W2 */
bool no_sext = src_reg->umax_value < (1ULL << (insn->off - 1));
- if (no_sext && need_id)
- src_reg->id = ++env->id_gen;
+ if (no_sext)
+ assign_scalar_id_before_mov(env, src_reg);
copy_register_state(dst_reg, src_reg);
if (!no_sext)
dst_reg->id = 0;
@@ -14809,11 +14941,36 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
int err;
/* Only conditional jumps are expected to reach here. */
- if (opcode == BPF_JA || opcode > BPF_JSLE) {
+ if (opcode == BPF_JA || opcode > BPF_JCOND) {
verbose(env, "invalid BPF_JMP/JMP32 opcode %x\n", opcode);
return -EINVAL;
}
+ if (opcode == BPF_JCOND) {
+ struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st;
+ int idx = *insn_idx;
+
+ if (insn->code != (BPF_JMP | BPF_JCOND) ||
+ insn->src_reg != BPF_MAY_GOTO ||
+ insn->dst_reg || insn->imm || insn->off == 0) {
+ verbose(env, "invalid may_goto off %d imm %d\n",
+ insn->off, insn->imm);
+ return -EINVAL;
+ }
+ prev_st = find_prev_entry(env, cur_st->parent, idx);
+
+ /* branch out 'fallthrough' insn as a new state to explore */
+ queued_st = push_stack(env, idx + 1, idx, false);
+ if (!queued_st)
+ return -ENOMEM;
+
+ queued_st->may_goto_depth++;
+ if (prev_st)
+ widen_imprecise_scalars(env, prev_st, queued_st);
+ *insn_idx += insn->off;
+ return 0;
+ }
+
/* check src2 operand */
err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
@@ -15065,6 +15222,10 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (insn->src_reg == BPF_PSEUDO_MAP_VALUE ||
insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) {
+ if (map->map_type == BPF_MAP_TYPE_ARENA) {
+ __mark_reg_unknown(env, dst_reg);
+ return 0;
+ }
dst_reg->type = PTR_TO_MAP_VALUE;
dst_reg->off = aux->map_off;
WARN_ON_ONCE(map->max_entries != 1);
@@ -15531,7 +15692,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
return DONE_EXPLORING;
case BPF_CALL:
- if (insn->src_reg == 0 && insn->imm == BPF_FUNC_timer_set_callback)
+ if (is_async_callback_calling_insn(insn))
/* Mark this call insn as a prune point to trigger
* is_state_visited() check before call itself is
* processed by __check_func_call(). Otherwise new
@@ -15597,6 +15758,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
default:
/* conditional jump with two edges */
mark_prune_point(env, t);
+ if (is_may_goto_insn(insn))
+ mark_force_checkpoint(env, t);
ret = push_insn(t, t + 1, FALLTHROUGH, env);
if (ret)
@@ -16160,8 +16323,8 @@ static int check_btf_info(struct bpf_verifier_env *env,
}
/* check %cur's range satisfies %old's */
-static bool range_within(struct bpf_reg_state *old,
- struct bpf_reg_state *cur)
+static bool range_within(const struct bpf_reg_state *old,
+ const struct bpf_reg_state *cur)
{
return old->umin_value <= cur->umin_value &&
old->umax_value >= cur->umax_value &&
@@ -16325,21 +16488,28 @@ static bool regs_exact(const struct bpf_reg_state *rold,
check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
}
+enum exact_level {
+ NOT_EXACT,
+ EXACT,
+ RANGE_WITHIN
+};
+
/* Returns true if (rold safe implies rcur safe) */
static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
- struct bpf_reg_state *rcur, struct bpf_idmap *idmap, bool exact)
+ struct bpf_reg_state *rcur, struct bpf_idmap *idmap,
+ enum exact_level exact)
{
- if (exact)
+ if (exact == EXACT)
return regs_exact(rold, rcur, idmap);
- if (!(rold->live & REG_LIVE_READ))
+ if (!(rold->live & REG_LIVE_READ) && exact == NOT_EXACT)
/* explored state didn't use this */
return true;
- if (rold->type == NOT_INIT)
- /* explored state can't have used this */
- return true;
- if (rcur->type == NOT_INIT)
- return false;
+ if (rold->type == NOT_INIT) {
+ if (exact == NOT_EXACT || rcur->type == NOT_INIT)
+ /* explored state can't have used this */
+ return true;
+ }
/* Enforce that register types have to match exactly, including their
* modifiers (like PTR_MAYBE_NULL, MEM_RDONLY, etc), as a general
@@ -16374,7 +16544,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
check_scalar_ids(rold->id, rcur->id, idmap);
}
- if (!rold->precise)
+ if (!rold->precise && exact == NOT_EXACT)
return true;
/* Why check_ids() for scalar registers?
*
@@ -16442,13 +16612,53 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
* the same stack frame, since fp-8 in foo != fp-8 in bar
*/
return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno;
+ case PTR_TO_ARENA:
+ return true;
default:
return regs_exact(rold, rcur, idmap);
}
}
+static struct bpf_reg_state unbound_reg;
+
+static __init int unbound_reg_init(void)
+{
+ __mark_reg_unknown_imprecise(&unbound_reg);
+ unbound_reg.live |= REG_LIVE_READ;
+ return 0;
+}
+late_initcall(unbound_reg_init);
+
+static bool is_stack_all_misc(struct bpf_verifier_env *env,
+ struct bpf_stack_state *stack)
+{
+ u32 i;
+
+ for (i = 0; i < ARRAY_SIZE(stack->slot_type); ++i) {
+ if ((stack->slot_type[i] == STACK_MISC) ||
+ (stack->slot_type[i] == STACK_INVALID && env->allow_uninit_stack))
+ continue;
+ return false;
+ }
+
+ return true;
+}
+
+static struct bpf_reg_state *scalar_reg_for_stack(struct bpf_verifier_env *env,
+ struct bpf_stack_state *stack)
+{
+ if (is_spilled_scalar_reg64(stack))
+ return &stack->spilled_ptr;
+
+ if (is_stack_all_misc(env, stack))
+ return &unbound_reg;
+
+ return NULL;
+}
+
static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
- struct bpf_func_state *cur, struct bpf_idmap *idmap, bool exact)
+ struct bpf_func_state *cur, struct bpf_idmap *idmap,
+ enum exact_level exact)
{
int i, spi;
@@ -16461,12 +16671,13 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
spi = i / BPF_REG_SIZE;
- if (exact &&
+ if (exact != NOT_EXACT &&
old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
cur->stack[spi].slot_type[i % BPF_REG_SIZE])
return false;
- if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ) && !exact) {
+ if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)
+ && exact == NOT_EXACT) {
i += BPF_REG_SIZE - 1;
/* explored state didn't use this */
continue;
@@ -16485,6 +16696,20 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
if (i >= cur->allocated_stack)
return false;
+ /* 64-bit scalar spill vs all slots MISC and vice versa.
+ * Load from all slots MISC produces unbound scalar.
+ * Construct a fake register for such stack and call
+ * regsafe() to ensure scalar ids are compared.
+ */
+ old_reg = scalar_reg_for_stack(env, &old->stack[spi]);
+ cur_reg = scalar_reg_for_stack(env, &cur->stack[spi]);
+ if (old_reg && cur_reg) {
+ if (!regsafe(env, old_reg, cur_reg, idmap, exact))
+ return false;
+ i += BPF_REG_SIZE - 1;
+ continue;
+ }
+
/* if old state was safe with misc data in the stack
* it will be safe with zero-initialized stack.
* The opposite is not true
@@ -16598,7 +16823,7 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
* the current state will reach 'bpf_exit' instruction safely
*/
static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old,
- struct bpf_func_state *cur, bool exact)
+ struct bpf_func_state *cur, enum exact_level exact)
{
int i;
@@ -16628,7 +16853,7 @@ static void reset_idmap_scratch(struct bpf_verifier_env *env)
static bool states_equal(struct bpf_verifier_env *env,
struct bpf_verifier_state *old,
struct bpf_verifier_state *cur,
- bool exact)
+ enum exact_level exact)
{
int i;
@@ -17002,7 +17227,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* => unsafe memory access at 11 would not be caught.
*/
if (is_iter_next_insn(env, insn_idx)) {
- if (states_equal(env, &sl->state, cur, true)) {
+ if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
struct bpf_func_state *cur_frame;
struct bpf_reg_state *iter_state, *iter_reg;
int spi;
@@ -17025,15 +17250,23 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
}
goto skip_inf_loop_check;
}
+ if (is_may_goto_insn_at(env, insn_idx)) {
+ if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
+ update_loop_entry(cur, &sl->state);
+ goto hit;
+ }
+ goto skip_inf_loop_check;
+ }
if (calls_callback(env, insn_idx)) {
- if (states_equal(env, &sl->state, cur, true))
+ if (states_equal(env, &sl->state, cur, RANGE_WITHIN))
goto hit;
goto skip_inf_loop_check;
}
/* attempt to detect infinite loop to avoid unnecessary doomed work */
if (states_maybe_looping(&sl->state, cur) &&
- states_equal(env, &sl->state, cur, false) &&
+ states_equal(env, &sl->state, cur, EXACT) &&
!iter_active_depths_differ(&sl->state, cur) &&
+ sl->state.may_goto_depth == cur->may_goto_depth &&
sl->state.callback_unroll_depth == cur->callback_unroll_depth) {
verbose_linfo(env, insn_idx, "; ");
verbose(env, "infinite loop detected at insn %d\n", insn_idx);
@@ -17089,7 +17322,7 @@ skip_inf_loop_check:
*/
loop_entry = get_loop_entry(&sl->state);
force_exact = loop_entry && loop_entry->branches > 0;
- if (states_equal(env, &sl->state, cur, force_exact)) {
+ if (states_equal(env, &sl->state, cur, force_exact ? RANGE_WITHIN : NOT_EXACT)) {
if (force_exact)
update_loop_entry(cur, loop_entry);
hit:
@@ -17259,6 +17492,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
case PTR_TO_TCP_SOCK:
case PTR_TO_XDP_SOCK:
case PTR_TO_BTF_ID:
+ case PTR_TO_ARENA:
return false;
default:
return true;
@@ -17541,7 +17775,6 @@ static int do_check(struct bpf_verifier_env *env)
if (env->cur_state->active_lock.ptr) {
if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) ||
- (insn->src_reg == BPF_PSEUDO_CALL) ||
(insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
(insn->off != 0 || !is_bpf_graph_api_kfunc(insn->imm)))) {
verbose(env, "function calls are not allowed while holding a lock\n");
@@ -17589,14 +17822,12 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
process_bpf_exit_full:
- if (env->cur_state->active_lock.ptr &&
- !in_rbtree_lock_required_cb(env)) {
+ if (env->cur_state->active_lock.ptr && !env->cur_state->curframe) {
verbose(env, "bpf_spin_unlock is missing\n");
return -EINVAL;
}
- if (env->cur_state->active_rcu_lock &&
- !in_rbtree_lock_required_cb(env)) {
+ if (env->cur_state->active_rcu_lock && !env->cur_state->curframe) {
verbose(env, "bpf_rcu_read_unlock is missing\n");
return -EINVAL;
}
@@ -17909,7 +18140,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
return -EINVAL;
}
- if (prog->aux->sleepable)
+ if (prog->sleepable)
switch (map->map_type) {
case BPF_MAP_TYPE_HASH:
case BPF_MAP_TYPE_LRU_HASH:
@@ -17925,6 +18156,9 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_SK_STORAGE:
case BPF_MAP_TYPE_TASK_STORAGE:
case BPF_MAP_TYPE_CGRP_STORAGE:
+ case BPF_MAP_TYPE_QUEUE:
+ case BPF_MAP_TYPE_STACK:
+ case BPF_MAP_TYPE_ARENA:
break;
default:
verbose(env,
@@ -18094,7 +18328,7 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
return -E2BIG;
}
- if (env->prog->aux->sleepable)
+ if (env->prog->sleepable)
atomic64_inc(&map->sleepable_refcnt);
/* hold the map. If the program is rejected by verifier,
* the map will be released by release_maps() or it
@@ -18112,6 +18346,31 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
fdput(f);
return -EBUSY;
}
+ if (map->map_type == BPF_MAP_TYPE_ARENA) {
+ if (env->prog->aux->arena) {
+ verbose(env, "Only one arena per program\n");
+ fdput(f);
+ return -EBUSY;
+ }
+ if (!env->allow_ptr_leaks || !env->bpf_capable) {
+ verbose(env, "CAP_BPF and CAP_PERFMON are required to use arena\n");
+ fdput(f);
+ return -EPERM;
+ }
+ if (!env->prog->jit_requested) {
+ verbose(env, "JIT is required to use arena\n");
+ return -EOPNOTSUPP;
+ }
+ if (!bpf_jit_supports_arena()) {
+ verbose(env, "JIT doesn't support arena\n");
+ return -EOPNOTSUPP;
+ }
+ env->prog->aux->arena = (void *)map;
+ if (!bpf_arena_get_user_vm_start(env->prog->aux->arena)) {
+ verbose(env, "arena's user address must be set via map_extra or mmap()\n");
+ return -EINVAL;
+ }
+ }
fdput(f);
next_insn:
@@ -18733,6 +18992,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
env->prog->aux->num_exentries++;
}
continue;
+ case PTR_TO_ARENA:
+ if (BPF_MODE(insn->code) == BPF_MEMSX) {
+ verbose(env, "sign extending loads from arena are not supported yet\n");
+ return -EOPNOTSUPP;
+ }
+ insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code);
+ env->prog->aux->num_exentries++;
+ continue;
default:
continue;
}
@@ -18918,13 +19185,19 @@ static int jit_subprogs(struct bpf_verifier_env *env)
func[i]->aux->nr_linfo = prog->aux->nr_linfo;
func[i]->aux->jited_linfo = prog->aux->jited_linfo;
func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx;
+ func[i]->aux->arena = prog->aux->arena;
num_exentries = 0;
insn = func[i]->insnsi;
for (j = 0; j < func[i]->len; j++, insn++) {
if (BPF_CLASS(insn->code) == BPF_LDX &&
(BPF_MODE(insn->code) == BPF_PROBE_MEM ||
+ BPF_MODE(insn->code) == BPF_PROBE_MEM32 ||
BPF_MODE(insn->code) == BPF_PROBE_MEMSX))
num_exentries++;
+ if ((BPF_CLASS(insn->code) == BPF_STX ||
+ BPF_CLASS(insn->code) == BPF_ST) &&
+ BPF_MODE(insn->code) == BPF_PROBE_MEM32)
+ num_exentries++;
}
func[i]->aux->num_exentries = num_exentries;
func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
@@ -19299,7 +19572,10 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
struct bpf_insn insn_buf[16];
struct bpf_prog *new_prog;
struct bpf_map *map_ptr;
- int i, ret, cnt, delta = 0;
+ int i, ret, cnt, delta = 0, cur_subprog = 0;
+ struct bpf_subprog_info *subprogs = env->subprog_info;
+ u16 stack_depth = subprogs[cur_subprog].stack_depth;
+ u16 stack_depth_extra = 0;
if (env->seen_exception && !env->exception_callback_subprog) {
struct bpf_insn patch[] = {
@@ -19319,7 +19595,22 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
mark_subprog_exc_cb(env, env->exception_callback_subprog);
}
- for (i = 0; i < insn_cnt; i++, insn++) {
+ for (i = 0; i < insn_cnt;) {
+ if (insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->imm) {
+ if ((insn->off == BPF_ADDR_SPACE_CAST && insn->imm == 1) ||
+ (((struct bpf_map *)env->prog->aux->arena)->map_flags & BPF_F_NO_USER_CONV)) {
+ /* convert to 32-bit mov that clears upper 32-bit */
+ insn->code = BPF_ALU | BPF_MOV | BPF_X;
+ /* clear off, so it's a normal 'wX = wY' from JIT pov */
+ insn->off = 0;
+ } /* cast from as(0) to as(1) should be handled by JIT */
+ goto next_insn;
+ }
+
+ if (env->insn_aux_data[i + delta].needs_zext)
+ /* Convert BPF_CLASS(insn->code) == BPF_ALU64 to 32-bit ALU */
+ insn->code = BPF_ALU | BPF_OP(insn->code) | BPF_SRC(insn->code);
+
/* Make divide-by-zero exceptions impossible. */
if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
@@ -19358,7 +19649,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
}
/* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */
@@ -19378,7 +19669,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
}
/* Rewrite pointer arithmetic to mitigate speculation attacks. */
@@ -19393,7 +19684,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
aux = &env->insn_aux_data[i + delta];
if (!aux->alu_state ||
aux->alu_state == BPF_ALU_NON_POINTER)
- continue;
+ goto next_insn;
isneg = aux->alu_state & BPF_ALU_NEG_VALUE;
issrc = (aux->alu_state & BPF_ALU_SANITIZE) ==
@@ -19431,19 +19722,39 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
+ }
+
+ if (is_may_goto_insn(insn)) {
+ int stack_off = -stack_depth - 8;
+
+ stack_depth_extra = 8;
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off);
+ insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
+ insn_buf[3] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off);
+ cnt = 4;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
}
if (insn->code != (BPF_JMP | BPF_CALL))
- continue;
+ goto next_insn;
if (insn->src_reg == BPF_PSEUDO_CALL)
- continue;
+ goto next_insn;
if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt);
if (ret)
return ret;
if (cnt == 0)
- continue;
+ goto next_insn;
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
@@ -19452,7 +19763,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
}
if (insn->imm == BPF_FUNC_get_route_realm)
@@ -19500,11 +19811,11 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
}
insn->imm = ret + 1;
- continue;
+ goto next_insn;
}
if (!bpf_map_ptr_unpriv(aux))
- continue;
+ goto next_insn;
/* instead of changing every JIT dealing with tail_call
* emit two extra insns:
@@ -19533,7 +19844,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
}
if (insn->imm == BPF_FUNC_timer_set_callback) {
@@ -19570,7 +19881,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
}
if (is_storage_get_function(insn->imm)) {
- if (!env->prog->aux->sleepable ||
+ if (!in_sleepable(env) ||
env->insn_aux_data[i + delta].storage_get_func_atomic)
insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC);
else
@@ -19645,7 +19956,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
}
BUILD_BUG_ON(!__same_type(ops->map_lookup_elem,
@@ -19676,31 +19987,31 @@ patch_map_ops_generic:
switch (insn->imm) {
case BPF_FUNC_map_lookup_elem:
insn->imm = BPF_CALL_IMM(ops->map_lookup_elem);
- continue;
+ goto next_insn;
case BPF_FUNC_map_update_elem:
insn->imm = BPF_CALL_IMM(ops->map_update_elem);
- continue;
+ goto next_insn;
case BPF_FUNC_map_delete_elem:
insn->imm = BPF_CALL_IMM(ops->map_delete_elem);
- continue;
+ goto next_insn;
case BPF_FUNC_map_push_elem:
insn->imm = BPF_CALL_IMM(ops->map_push_elem);
- continue;
+ goto next_insn;
case BPF_FUNC_map_pop_elem:
insn->imm = BPF_CALL_IMM(ops->map_pop_elem);
- continue;
+ goto next_insn;
case BPF_FUNC_map_peek_elem:
insn->imm = BPF_CALL_IMM(ops->map_peek_elem);
- continue;
+ goto next_insn;
case BPF_FUNC_redirect_map:
insn->imm = BPF_CALL_IMM(ops->map_redirect);
- continue;
+ goto next_insn;
case BPF_FUNC_for_each_map_elem:
insn->imm = BPF_CALL_IMM(ops->map_for_each_callback);
- continue;
+ goto next_insn;
case BPF_FUNC_map_lookup_percpu_elem:
insn->imm = BPF_CALL_IMM(ops->map_lookup_percpu_elem);
- continue;
+ goto next_insn;
}
goto patch_call_imm;
@@ -19728,7 +20039,7 @@ patch_map_ops_generic:
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
}
/* Implement bpf_get_func_arg inline. */
@@ -19753,7 +20064,7 @@ patch_map_ops_generic:
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
}
/* Implement bpf_get_func_ret inline. */
@@ -19781,7 +20092,7 @@ patch_map_ops_generic:
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
}
/* Implement get_func_arg_cnt inline. */
@@ -19796,7 +20107,7 @@ patch_map_ops_generic:
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
}
/* Implement bpf_get_func_ip inline. */
@@ -19811,9 +20122,26 @@ patch_map_ops_generic:
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
}
+ /* Implement bpf_kptr_xchg inline */
+ if (prog->jit_requested && BITS_PER_LONG == 64 &&
+ insn->imm == BPF_FUNC_kptr_xchg &&
+ bpf_jit_supports_ptr_xchg()) {
+ insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_2);
+ insn_buf[1] = BPF_ATOMIC_OP(BPF_DW, BPF_XCHG, BPF_REG_1, BPF_REG_0, 0);
+ cnt = 2;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
patch_call_imm:
fn = env->ops->get_func_proto(insn->imm, env->prog);
/* all functions that have prototype and verifier allowed
@@ -19826,6 +20154,40 @@ patch_call_imm:
return -EFAULT;
}
insn->imm = fn->func - __bpf_call_base;
+next_insn:
+ if (subprogs[cur_subprog + 1].start == i + delta + 1) {
+ subprogs[cur_subprog].stack_depth += stack_depth_extra;
+ subprogs[cur_subprog].stack_extra = stack_depth_extra;
+ cur_subprog++;
+ stack_depth = subprogs[cur_subprog].stack_depth;
+ stack_depth_extra = 0;
+ }
+ i++;
+ insn++;
+ }
+
+ env->prog->aux->stack_depth = subprogs[0].stack_depth;
+ for (i = 0; i < env->subprog_cnt; i++) {
+ int subprog_start = subprogs[i].start;
+ int stack_slots = subprogs[i].stack_extra / 8;
+
+ if (!stack_slots)
+ continue;
+ if (stack_slots > 1) {
+ verbose(env, "verifier bug: stack_slots supports may_goto only\n");
+ return -EFAULT;
+ }
+
+ /* Add ST insn to subprog prologue to init extra stack */
+ insn_buf[0] = BPF_ST_MEM(BPF_DW, BPF_REG_FP,
+ -subprogs[i].stack_depth, BPF_MAX_LOOPS);
+ /* Copy first actual insn to preserve it */
+ insn_buf[1] = env->prog->insnsi[subprog_start];
+
+ new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, 2);
+ if (!new_prog)
+ return -ENOMEM;
+ env->prog = prog = new_prog;
}
/* Since poke tab is now finalized, publish aux to tracker. */
@@ -20046,7 +20408,6 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
state->first_insn_idx = env->subprog_info[subprog].start;
state->last_insn_idx = -1;
-
regs = state->frame[state->curframe]->regs;
if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
const char *sub_name = subprog_name(env, subprog);
@@ -20090,6 +20451,21 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
mark_reg_known_zero(env, regs, i);
reg->mem_size = arg->mem_size;
reg->id = ++env->id_gen;
+ } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) {
+ reg->type = PTR_TO_BTF_ID;
+ if (arg->arg_type & PTR_MAYBE_NULL)
+ reg->type |= PTR_MAYBE_NULL;
+ if (arg->arg_type & PTR_UNTRUSTED)
+ reg->type |= PTR_UNTRUSTED;
+ if (arg->arg_type & PTR_TRUSTED)
+ reg->type |= PTR_TRUSTED;
+ mark_reg_known_zero(env, regs, i);
+ reg->btf = bpf_get_btf_vmlinux(); /* can't fail at this point */
+ reg->btf_id = arg->btf_id;
+ reg->id = ++env->id_gen;
+ } else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) {
+ /* caller can pass either PTR_TO_ARENA or SCALAR */
+ mark_reg_unknown(env, regs, i);
} else {
WARN_ONCE(1, "BUG: unhandled arg#%d type %d\n",
i - BPF_REG_1, arg->arg_type);
@@ -20238,10 +20614,12 @@ static void print_verification_stats(struct bpf_verifier_env *env)
static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
{
const struct btf_type *t, *func_proto;
+ const struct bpf_struct_ops_desc *st_ops_desc;
const struct bpf_struct_ops *st_ops;
const struct btf_member *member;
struct bpf_prog *prog = env->prog;
u32 btf_id, member_idx;
+ struct btf *btf;
const char *mname;
if (!prog->gpl_compatible) {
@@ -20249,15 +20627,30 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
return -EINVAL;
}
+ if (!prog->aux->attach_btf_id)
+ return -ENOTSUPP;
+
+ btf = prog->aux->attach_btf;
+ if (btf_is_module(btf)) {
+ /* Make sure st_ops is valid through the lifetime of env */
+ env->attach_btf_mod = btf_try_get_module(btf);
+ if (!env->attach_btf_mod) {
+ verbose(env, "struct_ops module %s is not found\n",
+ btf_get_name(btf));
+ return -ENOTSUPP;
+ }
+ }
+
btf_id = prog->aux->attach_btf_id;
- st_ops = bpf_struct_ops_find(btf_id);
- if (!st_ops) {
+ st_ops_desc = bpf_struct_ops_find(btf, btf_id);
+ if (!st_ops_desc) {
verbose(env, "attach_btf_id %u is not a supported struct\n",
btf_id);
return -ENOTSUPP;
}
+ st_ops = st_ops_desc->st_ops;
- t = st_ops->type;
+ t = st_ops_desc->type;
member_idx = prog->expected_attach_type;
if (member_idx >= btf_type_vlen(t)) {
verbose(env, "attach to invalid member idx %u of struct %s\n",
@@ -20266,8 +20659,8 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
}
member = &btf_type_member(t)[member_idx];
- mname = btf_name_by_offset(btf_vmlinux, member->name_off);
- func_proto = btf_type_resolve_func_ptr(btf_vmlinux, member->type,
+ mname = btf_name_by_offset(btf, member->name_off);
+ func_proto = btf_type_resolve_func_ptr(btf, member->type,
NULL);
if (!func_proto) {
verbose(env, "attach to invalid member %s(@idx %u) of struct %s\n",
@@ -20285,6 +20678,12 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
}
}
+ /* btf_ctx_access() used this to provide argument type info */
+ prog->aux->ctx_arg_info =
+ st_ops_desc->arg_info[member_idx].info;
+ prog->aux->ctx_arg_info_size =
+ st_ops_desc->arg_info[member_idx].cnt;
+
prog->aux->attach_func_proto = func_proto;
prog->aux->attach_func_name = mname;
env->ops = st_ops->verifier_ops;
@@ -20542,7 +20941,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
}
}
- if (prog->aux->sleepable) {
+ if (prog->sleepable) {
ret = -EINVAL;
switch (prog->type) {
case BPF_PROG_TYPE_TRACING:
@@ -20653,14 +21052,14 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
u64 key;
if (prog->type == BPF_PROG_TYPE_SYSCALL) {
- if (prog->aux->sleepable)
+ if (prog->sleepable)
/* attach_btf_id checked to be zero already */
return 0;
verbose(env, "Syscall programs can only be sleepable\n");
return -EINVAL;
}
- if (prog->aux->sleepable && !can_be_sleepable(prog)) {
+ if (prog->sleepable && !can_be_sleepable(prog)) {
verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n");
return -EINVAL;
}
@@ -20769,7 +21168,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
env->prog = *prog;
env->ops = bpf_verifier_ops[env->prog->type];
env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel);
- is_priv = bpf_capable();
+
+ env->allow_ptr_leaks = bpf_allow_ptr_leaks(env->prog->aux->token);
+ env->allow_uninit_stack = bpf_allow_uninit_stack(env->prog->aux->token);
+ env->bypass_spec_v1 = bpf_bypass_spec_v1(env->prog->aux->token);
+ env->bypass_spec_v4 = bpf_bypass_spec_v4(env->prog->aux->token);
+ env->bpf_capable = is_priv = bpf_token_capable(env->prog->aux->token, CAP_BPF);
bpf_get_btf_vmlinux();
@@ -20801,12 +21205,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
env->strict_alignment = false;
- env->allow_ptr_leaks = bpf_allow_ptr_leaks();
- env->allow_uninit_stack = bpf_allow_uninit_stack();
- env->bypass_spec_v1 = bpf_bypass_spec_v1();
- env->bypass_spec_v4 = bpf_bypass_spec_v4();
- env->bpf_capable = bpf_capable();
-
if (is_priv)
env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS;
@@ -20972,6 +21370,8 @@ err_release_maps:
env->prog->expected_attach_type = 0;
*prog = env->prog;
+
+ module_put(env->attach_btf_mod);
err_unlock:
if (!is_priv)
mutex_unlock(&bpf_verifier_lock);
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index a8350d2d63e6..07e2284bb499 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -562,10 +562,10 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
}
/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
-BTF_SET8_START(bpf_rstat_kfunc_ids)
+BTF_KFUNCS_START(bpf_rstat_kfunc_ids)
BTF_ID_FLAGS(func, cgroup_rstat_updated)
BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE)
-BTF_SET8_END(bpf_rstat_kfunc_ids)
+BTF_KFUNCS_END(bpf_rstat_kfunc_ids)
static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
.owner = THIS_MODULE,
diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config
index 4722b998a324..509ee703de15 100644
--- a/kernel/configs/debug.config
+++ b/kernel/configs/debug.config
@@ -40,6 +40,12 @@ CONFIG_UBSAN_ENUM=y
CONFIG_UBSAN_SHIFT=y
CONFIG_UBSAN_UNREACHABLE=y
#
+# Networking Debugging
+#
+CONFIG_NET_DEV_REFCNT_TRACKER=y
+CONFIG_NET_NS_REFCNT_TRACKER=y
+CONFIG_DEBUG_NET=y
+#
# Memory Debugging
#
# CONFIG_DEBUG_PAGEALLOC is not set
diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config
index 95a400f042b1..7a5bbfc024b7 100644
--- a/kernel/configs/hardening.config
+++ b/kernel/configs/hardening.config
@@ -44,7 +44,9 @@ CONFIG_UBSAN_BOUNDS=y
# CONFIG_UBSAN_BOOL
# CONFIG_UBSAN_ENUM
# CONFIG_UBSAN_ALIGNMENT
-CONFIG_UBSAN_SANITIZE_ALL=y
+
+# Sampling-based heap out-of-bounds and use-after-free detection.
+CONFIG_KFENCE=y
# Linked list integrity checking.
CONFIG_LIST_HARDENED=y
@@ -93,6 +95,3 @@ CONFIG_SYN_COOKIES=y
# Attack surface reduction: Use the modern PTY interface (devpts) only.
# CONFIG_LEGACY_PTYS is not set
-
-# Attack surface reduction: Use only modesetting video drivers.
-# CONFIG_DRM_LEGACY is not set
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 75cd6a736d03..78b5dc7cee3a 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -11,9 +11,14 @@
#include <linux/sizes.h>
#include <linux/kexec.h>
#include <linux/memory.h>
+#include <linux/mm.h>
#include <linux/cpuhotplug.h>
#include <linux/memblock.h>
#include <linux/kmemleak.h>
+#include <linux/crash_core.h>
+#include <linux/reboot.h>
+#include <linux/btf.h>
+#include <linux/objtool.h>
#include <asm/page.h>
#include <asm/sections.h>
@@ -26,451 +31,130 @@
/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t __percpu *crash_notes;
-/* vmcoreinfo stuff */
-unsigned char *vmcoreinfo_data;
-size_t vmcoreinfo_size;
-u32 *vmcoreinfo_note;
-
-/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
-static unsigned char *vmcoreinfo_data_safecopy;
-
-/* Location of the reserved area for the crash kernel */
-struct resource crashk_res = {
- .name = "Crash kernel",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
- .desc = IORES_DESC_CRASH_KERNEL
-};
-struct resource crashk_low_res = {
- .name = "Crash kernel",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
- .desc = IORES_DESC_CRASH_KERNEL
-};
-
-/*
- * parsing the "crashkernel" commandline
- *
- * this code is intended to be called from architecture specific code
- */
+#ifdef CONFIG_CRASH_DUMP
-
-/*
- * This function parses command lines in the format
- *
- * crashkernel=ramsize-range:size[,...][@offset]
- *
- * The function returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_mem(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
+int kimage_crash_copy_vmcoreinfo(struct kimage *image)
{
- char *cur = cmdline, *tmp;
- unsigned long long total_mem = system_ram;
+ struct page *vmcoreinfo_page;
+ void *safecopy;
+
+ if (!IS_ENABLED(CONFIG_CRASH_DUMP))
+ return 0;
+ if (image->type != KEXEC_TYPE_CRASH)
+ return 0;
/*
- * Firmware sometimes reserves some memory regions for its own use,
- * so the system memory size is less than the actual physical memory
- * size. Work around this by rounding up the total size to 128M,
- * which is enough for most test cases.
+ * For kdump, allocate one vmcoreinfo safe copy from the
+ * crash memory. as we have arch_kexec_protect_crashkres()
+ * after kexec syscall, we naturally protect it from write
+ * (even read) access under kernel direct mapping. But on
+ * the other hand, we still need to operate it when crash
+ * happens to generate vmcoreinfo note, hereby we rely on
+ * vmap for this purpose.
*/
- total_mem = roundup(total_mem, SZ_128M);
-
- /* for each entry of the comma-separated list */
- do {
- unsigned long long start, end = ULLONG_MAX, size;
-
- /* get the start of the range */
- start = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("crashkernel: Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (*cur != '-') {
- pr_warn("crashkernel: '-' expected\n");
- return -EINVAL;
- }
- cur++;
-
- /* if no ':' is here, than we read the end */
- if (*cur != ':') {
- end = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("crashkernel: Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (end <= start) {
- pr_warn("crashkernel: end <= start\n");
- return -EINVAL;
- }
- }
-
- if (*cur != ':') {
- pr_warn("crashkernel: ':' expected\n");
- return -EINVAL;
- }
- cur++;
-
- size = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (size >= total_mem) {
- pr_warn("crashkernel: invalid size\n");
- return -EINVAL;
- }
-
- /* match ? */
- if (total_mem >= start && total_mem < end) {
- *crash_size = size;
- break;
- }
- } while (*cur++ == ',');
-
- if (*crash_size > 0) {
- while (*cur && *cur != ' ' && *cur != '@')
- cur++;
- if (*cur == '@') {
- cur++;
- *crash_base = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("Memory value expected after '@'\n");
- return -EINVAL;
- }
- }
- } else
- pr_info("crashkernel size resulted in zero bytes\n");
-
- return 0;
-}
-
-/*
- * That function parses "simple" (old) crashkernel command lines like
- *
- * crashkernel=size[@offset]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_simple(char *cmdline,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- char *cur = cmdline;
-
- *crash_size = memparse(cmdline, &cur);
- if (cmdline == cur) {
- pr_warn("crashkernel: memory value expected\n");
- return -EINVAL;
- }
-
- if (*cur == '@')
- *crash_base = memparse(cur+1, &cur);
- else if (*cur != ' ' && *cur != '\0') {
- pr_warn("crashkernel: unrecognized char: %c\n", *cur);
- return -EINVAL;
+ vmcoreinfo_page = kimage_alloc_control_pages(image, 0);
+ if (!vmcoreinfo_page) {
+ pr_warn("Could not allocate vmcoreinfo buffer\n");
+ return -ENOMEM;
}
-
- return 0;
-}
-
-#define SUFFIX_HIGH 0
-#define SUFFIX_LOW 1
-#define SUFFIX_NULL 2
-static __initdata char *suffix_tbl[] = {
- [SUFFIX_HIGH] = ",high",
- [SUFFIX_LOW] = ",low",
- [SUFFIX_NULL] = NULL,
-};
-
-/*
- * That function parses "suffix" crashkernel command lines like
- *
- * crashkernel=size,[high|low]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_suffix(char *cmdline,
- unsigned long long *crash_size,
- const char *suffix)
-{
- char *cur = cmdline;
-
- *crash_size = memparse(cmdline, &cur);
- if (cmdline == cur) {
- pr_warn("crashkernel: memory value expected\n");
- return -EINVAL;
+ safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL);
+ if (!safecopy) {
+ pr_warn("Could not vmap vmcoreinfo buffer\n");
+ return -ENOMEM;
}
- /* check with suffix */
- if (strncmp(cur, suffix, strlen(suffix))) {
- pr_warn("crashkernel: unrecognized char: %c\n", *cur);
- return -EINVAL;
- }
- cur += strlen(suffix);
- if (*cur != ' ' && *cur != '\0') {
- pr_warn("crashkernel: unrecognized char: %c\n", *cur);
- return -EINVAL;
- }
+ image->vmcoreinfo_data_copy = safecopy;
+ crash_update_vmcoreinfo_safecopy(safecopy);
return 0;
}
-static __init char *get_last_crashkernel(char *cmdline,
- const char *name,
- const char *suffix)
-{
- char *p = cmdline, *ck_cmdline = NULL;
-
- /* find crashkernel and use the last one if there are more */
- p = strstr(p, name);
- while (p) {
- char *end_p = strchr(p, ' ');
- char *q;
-
- if (!end_p)
- end_p = p + strlen(p);
-
- if (!suffix) {
- int i;
-
- /* skip the one with any known suffix */
- for (i = 0; suffix_tbl[i]; i++) {
- q = end_p - strlen(suffix_tbl[i]);
- if (!strncmp(q, suffix_tbl[i],
- strlen(suffix_tbl[i])))
- goto next;
- }
- ck_cmdline = p;
- } else {
- q = end_p - strlen(suffix);
- if (!strncmp(q, suffix, strlen(suffix)))
- ck_cmdline = p;
- }
-next:
- p = strstr(p+1, name);
- }
- return ck_cmdline;
-}
-static int __init __parse_crashkernel(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base,
- const char *suffix)
+int kexec_should_crash(struct task_struct *p)
{
- char *first_colon, *first_space;
- char *ck_cmdline;
- char *name = "crashkernel=";
-
- BUG_ON(!crash_size || !crash_base);
- *crash_size = 0;
- *crash_base = 0;
-
- ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
- if (!ck_cmdline)
- return -ENOENT;
-
- ck_cmdline += strlen(name);
-
- if (suffix)
- return parse_crashkernel_suffix(ck_cmdline, crash_size,
- suffix);
/*
- * if the commandline contains a ':', then that's the extended
- * syntax -- if not, it must be the classic syntax
+ * If crash_kexec_post_notifiers is enabled, don't run
+ * crash_kexec() here yet, which must be run after panic
+ * notifiers in panic().
*/
- first_colon = strchr(ck_cmdline, ':');
- first_space = strchr(ck_cmdline, ' ');
- if (first_colon && (!first_space || first_colon < first_space))
- return parse_crashkernel_mem(ck_cmdline, system_ram,
- crash_size, crash_base);
-
- return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
-}
-
-/*
- * That function is the entry point for command line parsing and should be
- * called from the arch-specific code.
- *
- * If crashkernel=,high|low is supported on architecture, non-NULL values
- * should be passed to parameters 'low_size' and 'high'.
- */
-int __init parse_crashkernel(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base,
- unsigned long long *low_size,
- bool *high)
-{
- int ret;
-
- /* crashkernel=X[@offset] */
- ret = __parse_crashkernel(cmdline, system_ram, crash_size,
- crash_base, NULL);
-#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+ if (crash_kexec_post_notifiers)
+ return 0;
/*
- * If non-NULL 'high' passed in and no normal crashkernel
- * setting detected, try parsing crashkernel=,high|low.
+ * There are 4 panic() calls in make_task_dead() path, each of which
+ * corresponds to each of these 4 conditions.
*/
- if (high && ret == -ENOENT) {
- ret = __parse_crashkernel(cmdline, 0, crash_size,
- crash_base, suffix_tbl[SUFFIX_HIGH]);
- if (ret || !*crash_size)
- return -EINVAL;
-
- /*
- * crashkernel=Y,low can be specified or not, but invalid value
- * is not allowed.
- */
- ret = __parse_crashkernel(cmdline, 0, low_size,
- crash_base, suffix_tbl[SUFFIX_LOW]);
- if (ret == -ENOENT) {
- *low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
- ret = 0;
- } else if (ret) {
- return ret;
- }
-
- *high = true;
- }
-#endif
- if (!*crash_size)
- ret = -EINVAL;
-
- return ret;
+ if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
+ return 1;
+ return 0;
}
-/*
- * Add a dummy early_param handler to mark crashkernel= as a known command line
- * parameter and suppress incorrect warnings in init/main.c.
- */
-static int __init parse_crashkernel_dummy(char *arg)
+int kexec_crash_loaded(void)
{
- return 0;
+ return !!kexec_crash_image;
}
-early_param("crashkernel", parse_crashkernel_dummy);
+EXPORT_SYMBOL_GPL(kexec_crash_loaded);
-#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
-static int __init reserve_crashkernel_low(unsigned long long low_size)
+/*
+ * No panic_cpu check version of crash_kexec(). This function is called
+ * only when panic_cpu holds the current CPU number; this is the only CPU
+ * which processes crash_kexec routines.
+ */
+void __noclone __crash_kexec(struct pt_regs *regs)
{
-#ifdef CONFIG_64BIT
- unsigned long long low_base;
-
- low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX);
- if (!low_base) {
- pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size);
- return -ENOMEM;
+ /* Take the kexec_lock here to prevent sys_kexec_load
+ * running on one cpu from replacing the crash kernel
+ * we are using after a panic on a different cpu.
+ *
+ * If the crash kernel was not located in a fixed area
+ * of memory the xchg(&kexec_crash_image) would be
+ * sufficient. But since I reuse the memory...
+ */
+ if (kexec_trylock()) {
+ if (kexec_crash_image) {
+ struct pt_regs fixed_regs;
+
+ crash_setup_regs(&fixed_regs, regs);
+ crash_save_vmcoreinfo();
+ machine_crash_shutdown(&fixed_regs);
+ machine_kexec(kexec_crash_image);
+ }
+ kexec_unlock();
}
-
- pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n",
- low_base, low_base + low_size, low_size >> 20);
-
- crashk_low_res.start = low_base;
- crashk_low_res.end = low_base + low_size - 1;
-#endif
- return 0;
}
+STACK_FRAME_NON_STANDARD(__crash_kexec);
-void __init reserve_crashkernel_generic(char *cmdline,
- unsigned long long crash_size,
- unsigned long long crash_base,
- unsigned long long crash_low_size,
- bool high)
+__bpf_kfunc void crash_kexec(struct pt_regs *regs)
{
- unsigned long long search_end = CRASH_ADDR_LOW_MAX, search_base = 0;
- bool fixed_base = false;
-
- /* User specifies base address explicitly. */
- if (crash_base) {
- fixed_base = true;
- search_base = crash_base;
- search_end = crash_base + crash_size;
- } else if (high) {
- search_base = CRASH_ADDR_LOW_MAX;
- search_end = CRASH_ADDR_HIGH_MAX;
- }
+ int old_cpu, this_cpu;
-retry:
- crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
- search_base, search_end);
- if (!crash_base) {
- /*
- * For crashkernel=size[KMG]@offset[KMG], print out failure
- * message if can't reserve the specified region.
- */
- if (fixed_base) {
- pr_warn("crashkernel reservation failed - memory is in use.\n");
- return;
- }
+ /*
+ * Only one CPU is allowed to execute the crash_kexec() code as with
+ * panic(). Otherwise parallel calls of panic() and crash_kexec()
+ * may stop each other. To exclude them, we use panic_cpu here too.
+ */
+ old_cpu = PANIC_CPU_INVALID;
+ this_cpu = raw_smp_processor_id();
- /*
- * For crashkernel=size[KMG], if the first attempt was for
- * low memory, fall back to high memory, the minimum required
- * low memory will be reserved later.
- */
- if (!high && search_end == CRASH_ADDR_LOW_MAX) {
- search_end = CRASH_ADDR_HIGH_MAX;
- search_base = CRASH_ADDR_LOW_MAX;
- crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
- goto retry;
- }
+ if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) {
+ /* This is the 1st CPU which comes here, so go ahead. */
+ __crash_kexec(regs);
/*
- * For crashkernel=size[KMG],high, if the first attempt was
- * for high memory, fall back to low memory.
+ * Reset panic_cpu to allow another panic()/crash_kexec()
+ * call.
*/
- if (high && search_end == CRASH_ADDR_HIGH_MAX) {
- search_end = CRASH_ADDR_LOW_MAX;
- search_base = 0;
- goto retry;
- }
- pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
- crash_size);
- return;
- }
-
- if ((crash_base >= CRASH_ADDR_LOW_MAX) &&
- crash_low_size && reserve_crashkernel_low(crash_low_size)) {
- memblock_phys_free(crash_base, crash_size);
- return;
+ atomic_set(&panic_cpu, PANIC_CPU_INVALID);
}
-
- pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
- crash_base, crash_base + crash_size, crash_size >> 20);
-
- /*
- * The crashkernel memory will be removed from the kernel linear
- * map. Inform kmemleak so that it won't try to access it.
- */
- kmemleak_ignore_phys(crash_base);
- if (crashk_low_res.end)
- kmemleak_ignore_phys(crashk_low_res.start);
-
- crashk_res.start = crash_base;
- crashk_res.end = crash_base + crash_size - 1;
}
-static __init int insert_crashkernel_resources(void)
+static inline resource_size_t crash_resource_size(const struct resource *res)
{
- if (crashk_res.start < crashk_res.end)
- insert_resource(&iomem_resource, &crashk_res);
+ return !res->end ? 0 : resource_size(res);
+}
+
- if (crashk_low_res.start < crashk_low_res.end)
- insert_resource(&iomem_resource, &crashk_low_res);
- return 0;
-}
-early_initcall(insert_crashkernel_resources);
-#endif
int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
void **addr, unsigned long *sz)
@@ -633,205 +317,129 @@ int crash_exclude_mem_range(struct crash_mem *mem,
return 0;
}
-Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
- void *data, size_t data_len)
+ssize_t crash_get_memory_size(void)
{
- struct elf_note *note = (struct elf_note *)buf;
-
- note->n_namesz = strlen(name) + 1;
- note->n_descsz = data_len;
- note->n_type = type;
- buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word));
- memcpy(buf, name, note->n_namesz);
- buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word));
- memcpy(buf, data, data_len);
- buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word));
-
- return buf;
-}
+ ssize_t size = 0;
-void final_note(Elf_Word *buf)
-{
- memset(buf, 0, sizeof(struct elf_note));
-}
+ if (!kexec_trylock())
+ return -EBUSY;
-static void update_vmcoreinfo_note(void)
-{
- u32 *buf = vmcoreinfo_note;
+ size += crash_resource_size(&crashk_res);
+ size += crash_resource_size(&crashk_low_res);
- if (!vmcoreinfo_size)
- return;
- buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
- vmcoreinfo_size);
- final_note(buf);
+ kexec_unlock();
+ return size;
}
-void crash_update_vmcoreinfo_safecopy(void *ptr)
+static int __crash_shrink_memory(struct resource *old_res,
+ unsigned long new_size)
{
- if (ptr)
- memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size);
+ struct resource *ram_res;
- vmcoreinfo_data_safecopy = ptr;
-}
+ ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
+ if (!ram_res)
+ return -ENOMEM;
-void crash_save_vmcoreinfo(void)
-{
- if (!vmcoreinfo_note)
- return;
+ ram_res->start = old_res->start + new_size;
+ ram_res->end = old_res->end;
+ ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
+ ram_res->name = "System RAM";
+
+ if (!new_size) {
+ release_resource(old_res);
+ old_res->start = 0;
+ old_res->end = 0;
+ } else {
+ crashk_res.end = ram_res->start - 1;
+ }
- /* Use the safe copy to generate vmcoreinfo note if have */
- if (vmcoreinfo_data_safecopy)
- vmcoreinfo_data = vmcoreinfo_data_safecopy;
+ crash_free_reserved_phys_range(ram_res->start, ram_res->end);
+ insert_resource(&iomem_resource, ram_res);
- vmcoreinfo_append_str("CRASHTIME=%lld\n", ktime_get_real_seconds());
- update_vmcoreinfo_note();
+ return 0;
}
-void vmcoreinfo_append_str(const char *fmt, ...)
+int crash_shrink_memory(unsigned long new_size)
{
- va_list args;
- char buf[0x50];
- size_t r;
-
- va_start(args, fmt);
- r = vscnprintf(buf, sizeof(buf), fmt, args);
- va_end(args);
-
- r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size);
-
- memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
-
- vmcoreinfo_size += r;
-
- WARN_ONCE(vmcoreinfo_size == VMCOREINFO_BYTES,
- "vmcoreinfo data exceeds allocated size, truncating");
-}
+ int ret = 0;
+ unsigned long old_size, low_size;
-/*
- * provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak arch_crash_save_vmcoreinfo(void)
-{}
+ if (!kexec_trylock())
+ return -EBUSY;
-phys_addr_t __weak paddr_vmcoreinfo_note(void)
-{
- return __pa(vmcoreinfo_note);
-}
-EXPORT_SYMBOL(paddr_vmcoreinfo_note);
+ if (kexec_crash_image) {
+ ret = -ENOENT;
+ goto unlock;
+ }
-static int __init crash_save_vmcoreinfo_init(void)
-{
- vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
- if (!vmcoreinfo_data) {
- pr_warn("Memory allocation for vmcoreinfo_data failed\n");
- return -ENOMEM;
+ low_size = crash_resource_size(&crashk_low_res);
+ old_size = crash_resource_size(&crashk_res) + low_size;
+ new_size = roundup(new_size, KEXEC_CRASH_MEM_ALIGN);
+ if (new_size >= old_size) {
+ ret = (new_size == old_size) ? 0 : -EINVAL;
+ goto unlock;
}
- vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE,
- GFP_KERNEL | __GFP_ZERO);
- if (!vmcoreinfo_note) {
- free_page((unsigned long)vmcoreinfo_data);
- vmcoreinfo_data = NULL;
- pr_warn("Memory allocation for vmcoreinfo_note failed\n");
- return -ENOMEM;
+ /*
+ * (low_size > new_size) implies that low_size is greater than zero.
+ * This also means that if low_size is zero, the else branch is taken.
+ *
+ * If low_size is greater than 0, (low_size > new_size) indicates that
+ * crashk_low_res also needs to be shrunken. Otherwise, only crashk_res
+ * needs to be shrunken.
+ */
+ if (low_size > new_size) {
+ ret = __crash_shrink_memory(&crashk_res, 0);
+ if (ret)
+ goto unlock;
+
+ ret = __crash_shrink_memory(&crashk_low_res, new_size);
+ } else {
+ ret = __crash_shrink_memory(&crashk_res, new_size - low_size);
}
- VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
- VMCOREINFO_BUILD_ID();
- VMCOREINFO_PAGESIZE(PAGE_SIZE);
+ /* Swap crashk_res and crashk_low_res if needed */
+ if (!crashk_res.end && crashk_low_res.end) {
+ crashk_res.start = crashk_low_res.start;
+ crashk_res.end = crashk_low_res.end;
+ release_resource(&crashk_low_res);
+ crashk_low_res.start = 0;
+ crashk_low_res.end = 0;
+ insert_resource(&iomem_resource, &crashk_res);
+ }
- VMCOREINFO_SYMBOL(init_uts_ns);
- VMCOREINFO_OFFSET(uts_namespace, name);
- VMCOREINFO_SYMBOL(node_online_map);
-#ifdef CONFIG_MMU
- VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir);
-#endif
- VMCOREINFO_SYMBOL(_stext);
- VMCOREINFO_SYMBOL(vmap_area_list);
+unlock:
+ kexec_unlock();
+ return ret;
+}
-#ifndef CONFIG_NUMA
- VMCOREINFO_SYMBOL(mem_map);
- VMCOREINFO_SYMBOL(contig_page_data);
-#endif
-#ifdef CONFIG_SPARSEMEM
- VMCOREINFO_SYMBOL_ARRAY(mem_section);
- VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
- VMCOREINFO_STRUCT_SIZE(mem_section);
- VMCOREINFO_OFFSET(mem_section, section_mem_map);
- VMCOREINFO_NUMBER(SECTION_SIZE_BITS);
- VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS);
-#endif
- VMCOREINFO_STRUCT_SIZE(page);
- VMCOREINFO_STRUCT_SIZE(pglist_data);
- VMCOREINFO_STRUCT_SIZE(zone);
- VMCOREINFO_STRUCT_SIZE(free_area);
- VMCOREINFO_STRUCT_SIZE(list_head);
- VMCOREINFO_SIZE(nodemask_t);
- VMCOREINFO_OFFSET(page, flags);
- VMCOREINFO_OFFSET(page, _refcount);
- VMCOREINFO_OFFSET(page, mapping);
- VMCOREINFO_OFFSET(page, lru);
- VMCOREINFO_OFFSET(page, _mapcount);
- VMCOREINFO_OFFSET(page, private);
- VMCOREINFO_OFFSET(page, compound_head);
- VMCOREINFO_OFFSET(pglist_data, node_zones);
- VMCOREINFO_OFFSET(pglist_data, nr_zones);
-#ifdef CONFIG_FLATMEM
- VMCOREINFO_OFFSET(pglist_data, node_mem_map);
-#endif
- VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
- VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
- VMCOREINFO_OFFSET(pglist_data, node_id);
- VMCOREINFO_OFFSET(zone, free_area);
- VMCOREINFO_OFFSET(zone, vm_stat);
- VMCOREINFO_OFFSET(zone, spanned_pages);
- VMCOREINFO_OFFSET(free_area, free_list);
- VMCOREINFO_OFFSET(list_head, next);
- VMCOREINFO_OFFSET(list_head, prev);
- VMCOREINFO_OFFSET(vmap_area, va_start);
- VMCOREINFO_OFFSET(vmap_area, list);
- VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS);
- log_buf_vmcoreinfo_setup();
- VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
- VMCOREINFO_NUMBER(NR_FREE_PAGES);
- VMCOREINFO_NUMBER(PG_lru);
- VMCOREINFO_NUMBER(PG_private);
- VMCOREINFO_NUMBER(PG_swapcache);
- VMCOREINFO_NUMBER(PG_swapbacked);
- VMCOREINFO_NUMBER(PG_slab);
-#ifdef CONFIG_MEMORY_FAILURE
- VMCOREINFO_NUMBER(PG_hwpoison);
-#endif
- VMCOREINFO_NUMBER(PG_head_mask);
-#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy)
- VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
-#ifdef CONFIG_HUGETLB_PAGE
- VMCOREINFO_NUMBER(PG_hugetlb);
-#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline)
- VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
-#endif
+void crash_save_cpu(struct pt_regs *regs, int cpu)
+{
+ struct elf_prstatus prstatus;
+ u32 *buf;
-#ifdef CONFIG_KALLSYMS
- VMCOREINFO_SYMBOL(kallsyms_names);
- VMCOREINFO_SYMBOL(kallsyms_num_syms);
- VMCOREINFO_SYMBOL(kallsyms_token_table);
- VMCOREINFO_SYMBOL(kallsyms_token_index);
-#ifdef CONFIG_KALLSYMS_BASE_RELATIVE
- VMCOREINFO_SYMBOL(kallsyms_offsets);
- VMCOREINFO_SYMBOL(kallsyms_relative_base);
-#else
- VMCOREINFO_SYMBOL(kallsyms_addresses);
-#endif /* CONFIG_KALLSYMS_BASE_RELATIVE */
-#endif /* CONFIG_KALLSYMS */
-
- arch_crash_save_vmcoreinfo();
- update_vmcoreinfo_note();
+ if ((cpu < 0) || (cpu >= nr_cpu_ids))
+ return;
- return 0;
+ /* Using ELF notes here is opportunistic.
+ * I need a well defined structure format
+ * for the data I pass, and I need tags
+ * on the data to indicate what information I have
+ * squirrelled away. ELF notes happen to provide
+ * all of that, so there is no need to invent something new.
+ */
+ buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
+ if (!buf)
+ return;
+ memset(&prstatus, 0, sizeof(prstatus));
+ prstatus.common.pr_pid = current->pid;
+ elf_core_copy_regs(&prstatus.pr_reg, regs);
+ buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
+ &prstatus, sizeof(prstatus));
+ final_note(buf);
}
-subsys_initcall(crash_save_vmcoreinfo_init);
+
static int __init crash_notes_memory_init(void)
{
@@ -866,6 +474,8 @@ static int __init crash_notes_memory_init(void)
}
subsys_initcall(crash_notes_memory_init);
+#endif /*CONFIG_CRASH_DUMP*/
+
#ifdef CONFIG_CRASH_HOTPLUG
#undef pr_fmt
#define pr_fmt(fmt) "crash hp: " fmt
diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c
new file mode 100644
index 000000000000..bbb6c3cb00e4
--- /dev/null
+++ b/kernel/crash_reserve.c
@@ -0,0 +1,464 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * crash.c - kernel crash support code.
+ * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
+ */
+
+#include <linux/buildid.h>
+#include <linux/init.h>
+#include <linux/utsname.h>
+#include <linux/vmalloc.h>
+#include <linux/sizes.h>
+#include <linux/kexec.h>
+#include <linux/memory.h>
+#include <linux/cpuhotplug.h>
+#include <linux/memblock.h>
+#include <linux/kexec.h>
+#include <linux/kmemleak.h>
+
+#include <asm/page.h>
+#include <asm/sections.h>
+
+#include <crypto/sha1.h>
+
+#include "kallsyms_internal.h"
+#include "kexec_internal.h"
+
+/* Location of the reserved area for the crash kernel */
+struct resource crashk_res = {
+ .name = "Crash kernel",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+ .desc = IORES_DESC_CRASH_KERNEL
+};
+struct resource crashk_low_res = {
+ .name = "Crash kernel",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+ .desc = IORES_DESC_CRASH_KERNEL
+};
+
+/*
+ * parsing the "crashkernel" commandline
+ *
+ * this code is intended to be called from architecture specific code
+ */
+
+
+/*
+ * This function parses command lines in the format
+ *
+ * crashkernel=ramsize-range:size[,...][@offset]
+ *
+ * The function returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_mem(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ char *cur = cmdline, *tmp;
+ unsigned long long total_mem = system_ram;
+
+ /*
+ * Firmware sometimes reserves some memory regions for its own use,
+ * so the system memory size is less than the actual physical memory
+ * size. Work around this by rounding up the total size to 128M,
+ * which is enough for most test cases.
+ */
+ total_mem = roundup(total_mem, SZ_128M);
+
+ /* for each entry of the comma-separated list */
+ do {
+ unsigned long long start, end = ULLONG_MAX, size;
+
+ /* get the start of the range */
+ start = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("crashkernel: Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (*cur != '-') {
+ pr_warn("crashkernel: '-' expected\n");
+ return -EINVAL;
+ }
+ cur++;
+
+ /* if no ':' is here, than we read the end */
+ if (*cur != ':') {
+ end = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("crashkernel: Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (end <= start) {
+ pr_warn("crashkernel: end <= start\n");
+ return -EINVAL;
+ }
+ }
+
+ if (*cur != ':') {
+ pr_warn("crashkernel: ':' expected\n");
+ return -EINVAL;
+ }
+ cur++;
+
+ size = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (size >= total_mem) {
+ pr_warn("crashkernel: invalid size\n");
+ return -EINVAL;
+ }
+
+ /* match ? */
+ if (total_mem >= start && total_mem < end) {
+ *crash_size = size;
+ break;
+ }
+ } while (*cur++ == ',');
+
+ if (*crash_size > 0) {
+ while (*cur && *cur != ' ' && *cur != '@')
+ cur++;
+ if (*cur == '@') {
+ cur++;
+ *crash_base = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("Memory value expected after '@'\n");
+ return -EINVAL;
+ }
+ }
+ } else
+ pr_info("crashkernel size resulted in zero bytes\n");
+
+ return 0;
+}
+
+/*
+ * That function parses "simple" (old) crashkernel command lines like
+ *
+ * crashkernel=size[@offset]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_simple(char *cmdline,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ char *cur = cmdline;
+
+ *crash_size = memparse(cmdline, &cur);
+ if (cmdline == cur) {
+ pr_warn("crashkernel: memory value expected\n");
+ return -EINVAL;
+ }
+
+ if (*cur == '@')
+ *crash_base = memparse(cur+1, &cur);
+ else if (*cur != ' ' && *cur != '\0') {
+ pr_warn("crashkernel: unrecognized char: %c\n", *cur);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+#define SUFFIX_HIGH 0
+#define SUFFIX_LOW 1
+#define SUFFIX_NULL 2
+static __initdata char *suffix_tbl[] = {
+ [SUFFIX_HIGH] = ",high",
+ [SUFFIX_LOW] = ",low",
+ [SUFFIX_NULL] = NULL,
+};
+
+/*
+ * That function parses "suffix" crashkernel command lines like
+ *
+ * crashkernel=size,[high|low]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_suffix(char *cmdline,
+ unsigned long long *crash_size,
+ const char *suffix)
+{
+ char *cur = cmdline;
+
+ *crash_size = memparse(cmdline, &cur);
+ if (cmdline == cur) {
+ pr_warn("crashkernel: memory value expected\n");
+ return -EINVAL;
+ }
+
+ /* check with suffix */
+ if (strncmp(cur, suffix, strlen(suffix))) {
+ pr_warn("crashkernel: unrecognized char: %c\n", *cur);
+ return -EINVAL;
+ }
+ cur += strlen(suffix);
+ if (*cur != ' ' && *cur != '\0') {
+ pr_warn("crashkernel: unrecognized char: %c\n", *cur);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static __init char *get_last_crashkernel(char *cmdline,
+ const char *name,
+ const char *suffix)
+{
+ char *p = cmdline, *ck_cmdline = NULL;
+
+ /* find crashkernel and use the last one if there are more */
+ p = strstr(p, name);
+ while (p) {
+ char *end_p = strchr(p, ' ');
+ char *q;
+
+ if (!end_p)
+ end_p = p + strlen(p);
+
+ if (!suffix) {
+ int i;
+
+ /* skip the one with any known suffix */
+ for (i = 0; suffix_tbl[i]; i++) {
+ q = end_p - strlen(suffix_tbl[i]);
+ if (!strncmp(q, suffix_tbl[i],
+ strlen(suffix_tbl[i])))
+ goto next;
+ }
+ ck_cmdline = p;
+ } else {
+ q = end_p - strlen(suffix);
+ if (!strncmp(q, suffix, strlen(suffix)))
+ ck_cmdline = p;
+ }
+next:
+ p = strstr(p+1, name);
+ }
+
+ return ck_cmdline;
+}
+
+static int __init __parse_crashkernel(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base,
+ const char *suffix)
+{
+ char *first_colon, *first_space;
+ char *ck_cmdline;
+ char *name = "crashkernel=";
+
+ BUG_ON(!crash_size || !crash_base);
+ *crash_size = 0;
+ *crash_base = 0;
+
+ ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
+ if (!ck_cmdline)
+ return -ENOENT;
+
+ ck_cmdline += strlen(name);
+
+ if (suffix)
+ return parse_crashkernel_suffix(ck_cmdline, crash_size,
+ suffix);
+ /*
+ * if the commandline contains a ':', then that's the extended
+ * syntax -- if not, it must be the classic syntax
+ */
+ first_colon = strchr(ck_cmdline, ':');
+ first_space = strchr(ck_cmdline, ' ');
+ if (first_colon && (!first_space || first_colon < first_space))
+ return parse_crashkernel_mem(ck_cmdline, system_ram,
+ crash_size, crash_base);
+
+ return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
+}
+
+/*
+ * That function is the entry point for command line parsing and should be
+ * called from the arch-specific code.
+ *
+ * If crashkernel=,high|low is supported on architecture, non-NULL values
+ * should be passed to parameters 'low_size' and 'high'.
+ */
+int __init parse_crashkernel(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base,
+ unsigned long long *low_size,
+ bool *high)
+{
+ int ret;
+
+ /* crashkernel=X[@offset] */
+ ret = __parse_crashkernel(cmdline, system_ram, crash_size,
+ crash_base, NULL);
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+ /*
+ * If non-NULL 'high' passed in and no normal crashkernel
+ * setting detected, try parsing crashkernel=,high|low.
+ */
+ if (high && ret == -ENOENT) {
+ ret = __parse_crashkernel(cmdline, 0, crash_size,
+ crash_base, suffix_tbl[SUFFIX_HIGH]);
+ if (ret || !*crash_size)
+ return -EINVAL;
+
+ /*
+ * crashkernel=Y,low can be specified or not, but invalid value
+ * is not allowed.
+ */
+ ret = __parse_crashkernel(cmdline, 0, low_size,
+ crash_base, suffix_tbl[SUFFIX_LOW]);
+ if (ret == -ENOENT) {
+ *low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
+ ret = 0;
+ } else if (ret) {
+ return ret;
+ }
+
+ *high = true;
+ }
+#endif
+ if (!*crash_size)
+ ret = -EINVAL;
+
+ return ret;
+}
+
+/*
+ * Add a dummy early_param handler to mark crashkernel= as a known command line
+ * parameter and suppress incorrect warnings in init/main.c.
+ */
+static int __init parse_crashkernel_dummy(char *arg)
+{
+ return 0;
+}
+early_param("crashkernel", parse_crashkernel_dummy);
+
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+static int __init reserve_crashkernel_low(unsigned long long low_size)
+{
+#ifdef CONFIG_64BIT
+ unsigned long long low_base;
+
+ low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX);
+ if (!low_base) {
+ pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size);
+ return -ENOMEM;
+ }
+
+ pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n",
+ low_base, low_base + low_size, low_size >> 20);
+
+ crashk_low_res.start = low_base;
+ crashk_low_res.end = low_base + low_size - 1;
+ insert_resource(&iomem_resource, &crashk_low_res);
+#endif
+ return 0;
+}
+
+void __init reserve_crashkernel_generic(char *cmdline,
+ unsigned long long crash_size,
+ unsigned long long crash_base,
+ unsigned long long crash_low_size,
+ bool high)
+{
+ unsigned long long search_end = CRASH_ADDR_LOW_MAX, search_base = 0;
+ bool fixed_base = false;
+
+ /* User specifies base address explicitly. */
+ if (crash_base) {
+ fixed_base = true;
+ search_base = crash_base;
+ search_end = crash_base + crash_size;
+ } else if (high) {
+ search_base = CRASH_ADDR_LOW_MAX;
+ search_end = CRASH_ADDR_HIGH_MAX;
+ }
+
+retry:
+ crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
+ search_base, search_end);
+ if (!crash_base) {
+ /*
+ * For crashkernel=size[KMG]@offset[KMG], print out failure
+ * message if can't reserve the specified region.
+ */
+ if (fixed_base) {
+ pr_warn("crashkernel reservation failed - memory is in use.\n");
+ return;
+ }
+
+ /*
+ * For crashkernel=size[KMG], if the first attempt was for
+ * low memory, fall back to high memory, the minimum required
+ * low memory will be reserved later.
+ */
+ if (!high && search_end == CRASH_ADDR_LOW_MAX) {
+ search_end = CRASH_ADDR_HIGH_MAX;
+ search_base = CRASH_ADDR_LOW_MAX;
+ crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
+ goto retry;
+ }
+
+ /*
+ * For crashkernel=size[KMG],high, if the first attempt was
+ * for high memory, fall back to low memory.
+ */
+ if (high && search_end == CRASH_ADDR_HIGH_MAX) {
+ search_end = CRASH_ADDR_LOW_MAX;
+ search_base = 0;
+ goto retry;
+ }
+ pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
+ crash_size);
+ return;
+ }
+
+ if ((crash_base >= CRASH_ADDR_LOW_MAX) &&
+ crash_low_size && reserve_crashkernel_low(crash_low_size)) {
+ memblock_phys_free(crash_base, crash_size);
+ return;
+ }
+
+ pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
+ crash_base, crash_base + crash_size, crash_size >> 20);
+
+ /*
+ * The crashkernel memory will be removed from the kernel linear
+ * map. Inform kmemleak so that it won't try to access it.
+ */
+ kmemleak_ignore_phys(crash_base);
+ if (crashk_low_res.end)
+ kmemleak_ignore_phys(crashk_low_res.start);
+
+ crashk_res.start = crash_base;
+ crashk_res.end = crash_base + crash_size - 1;
+}
+
+static __init int insert_crashkernel_resources(void)
+{
+ if (crashk_res.start < crashk_res.end)
+ insert_resource(&iomem_resource, &crashk_res);
+
+ if (crashk_low_res.start < crashk_low_res.end)
+ insert_resource(&iomem_resource, &crashk_low_res);
+
+ return 0;
+}
+early_initcall(insert_crashkernel_resources);
+#endif
diff --git a/kernel/cred.c b/kernel/cred.c
index c033a201c808..075cfa7c896f 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -606,8 +606,8 @@ int set_cred_ucounts(struct cred *new)
void __init cred_init(void)
{
/* allocate a slab in which we can store credentials */
- cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
+ cred_jar = KMEM_CACHE(cred,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
}
/**
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index f005c66f378c..055da410ac71 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -37,12 +37,6 @@
#define pr_fmt(fmt) "cma: " fmt
-#ifdef CONFIG_CMA_DEBUG
-#ifndef DEBUG
-# define DEBUG
-#endif
-#endif
-
#include <asm/page.h>
#include <linux/memblock.h>
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 98b2e192fd69..4d543b1e9d57 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -286,7 +286,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
} else {
ret = page_address(page);
if (dma_set_decrypted(dev, ret, size))
- goto out_free_pages;
+ goto out_leak_pages;
}
memset(ret, 0, size);
@@ -307,6 +307,8 @@ out_encrypt_pages:
out_free_pages:
__dma_direct_free_pages(dev, page, size);
return NULL;
+out_leak_pages:
+ return NULL;
}
void dma_direct_free(struct device *dev, size_t size,
@@ -367,12 +369,11 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
ret = page_address(page);
if (dma_set_decrypted(dev, ret, size))
- goto out_free_pages;
+ goto out_leak_pages;
memset(ret, 0, size);
*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
return page;
-out_free_pages:
- __dma_direct_free_pages(dev, page, size);
+out_leak_pages:
return NULL;
}
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index b079a9a8e087..86fe172b5958 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -956,6 +956,28 @@ static void dec_used(struct io_tlb_mem *mem, unsigned int nslots)
}
#endif /* CONFIG_DEBUG_FS */
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+#ifdef CONFIG_DEBUG_FS
+static void inc_transient_used(struct io_tlb_mem *mem, unsigned int nslots)
+{
+ atomic_long_add(nslots, &mem->transient_nslabs);
+}
+
+static void dec_transient_used(struct io_tlb_mem *mem, unsigned int nslots)
+{
+ atomic_long_sub(nslots, &mem->transient_nslabs);
+}
+
+#else /* !CONFIG_DEBUG_FS */
+static void inc_transient_used(struct io_tlb_mem *mem, unsigned int nslots)
+{
+}
+static void dec_transient_used(struct io_tlb_mem *mem, unsigned int nslots)
+{
+}
+#endif /* CONFIG_DEBUG_FS */
+#endif /* CONFIG_SWIOTLB_DYNAMIC */
+
/**
* swiotlb_search_pool_area() - search one memory area in one pool
* @dev: Device which maps the buffer.
@@ -981,8 +1003,7 @@ static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool
dma_addr_t tbl_dma_addr =
phys_to_dma_unencrypted(dev, pool->start) & boundary_mask;
unsigned long max_slots = get_max_slots(boundary_mask);
- unsigned int iotlb_align_mask =
- dma_get_min_align_mask(dev) | alloc_align_mask;
+ unsigned int iotlb_align_mask = dma_get_min_align_mask(dev);
unsigned int nslots = nr_slots(alloc_size), stride;
unsigned int offset = swiotlb_align_offset(dev, orig_addr);
unsigned int index, slots_checked, count = 0, i;
@@ -994,18 +1015,29 @@ static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool
BUG_ON(area_index >= pool->nareas);
/*
- * For allocations of PAGE_SIZE or larger only look for page aligned
- * allocations.
+ * Historically, swiotlb allocations >= PAGE_SIZE were guaranteed to be
+ * page-aligned in the absence of any other alignment requirements.
+ * 'alloc_align_mask' was later introduced to specify the alignment
+ * explicitly, however this is passed as zero for streaming mappings
+ * and so we preserve the old behaviour there in case any drivers are
+ * relying on it.
*/
- if (alloc_size >= PAGE_SIZE)
- iotlb_align_mask |= ~PAGE_MASK;
- iotlb_align_mask &= ~(IO_TLB_SIZE - 1);
+ if (!alloc_align_mask && !iotlb_align_mask && alloc_size >= PAGE_SIZE)
+ alloc_align_mask = PAGE_SIZE - 1;
+
+ /*
+ * Ensure that the allocation is at least slot-aligned and update
+ * 'iotlb_align_mask' to ignore bits that will be preserved when
+ * offsetting into the allocation.
+ */
+ alloc_align_mask |= (IO_TLB_SIZE - 1);
+ iotlb_align_mask &= ~alloc_align_mask;
/*
* For mappings with an alignment requirement don't bother looping to
* unaligned slots once we found an aligned one.
*/
- stride = (iotlb_align_mask >> IO_TLB_SHIFT) + 1;
+ stride = get_max_slots(max(alloc_align_mask, iotlb_align_mask));
spin_lock_irqsave(&area->lock, flags);
if (unlikely(nslots > pool->area_nslabs - area->used))
@@ -1015,11 +1047,14 @@ static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool
index = area->index;
for (slots_checked = 0; slots_checked < pool->area_nslabs; ) {
+ phys_addr_t tlb_addr;
+
slot_index = slot_base + index;
+ tlb_addr = slot_addr(tbl_dma_addr, slot_index);
- if (orig_addr &&
- (slot_addr(tbl_dma_addr, slot_index) &
- iotlb_align_mask) != (orig_addr & iotlb_align_mask)) {
+ if ((tlb_addr & alloc_align_mask) ||
+ (orig_addr && (tlb_addr & iotlb_align_mask) !=
+ (orig_addr & iotlb_align_mask))) {
index = wrap_area_index(pool, index + 1);
slots_checked++;
continue;
@@ -1170,6 +1205,7 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
spin_lock_irqsave(&dev->dma_io_tlb_lock, flags);
list_add_rcu(&pool->node, &dev->dma_io_tlb_pools);
spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags);
+ inc_transient_used(mem, pool->nslabs);
found:
WRITE_ONCE(dev->dma_uses_io_tlb, true);
@@ -1415,6 +1451,7 @@ static bool swiotlb_del_transient(struct device *dev, phys_addr_t tlb_addr)
dec_used(dev->dma_io_tlb_mem, pool->nslabs);
swiotlb_del_pool(dev, pool);
+ dec_transient_used(dev->dma_io_tlb_mem, pool->nslabs);
return true;
}
@@ -1557,6 +1594,23 @@ phys_addr_t default_swiotlb_limit(void)
}
#ifdef CONFIG_DEBUG_FS
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+static unsigned long mem_transient_used(struct io_tlb_mem *mem)
+{
+ return atomic_long_read(&mem->transient_nslabs);
+}
+
+static int io_tlb_transient_used_get(void *data, u64 *val)
+{
+ struct io_tlb_mem *mem = data;
+
+ *val = mem_transient_used(mem);
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_transient_used, io_tlb_transient_used_get,
+ NULL, "%llu\n");
+#endif /* CONFIG_SWIOTLB_DYNAMIC */
static int io_tlb_used_get(void *data, u64 *val)
{
@@ -1605,6 +1659,11 @@ static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
&fops_io_tlb_used);
debugfs_create_file("io_tlb_used_hiwater", 0600, mem->debugfs, mem,
&fops_io_tlb_hiwater);
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ atomic_long_set(&mem->transient_nslabs, 0);
+ debugfs_create_file("io_tlb_transient_nslabs", 0400, mem->debugfs,
+ mem, &fops_io_tlb_transient_used);
+#endif
}
static int __init swiotlb_create_default_debugfs(void)
@@ -1631,16 +1690,24 @@ struct page *swiotlb_alloc(struct device *dev, size_t size)
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
struct io_tlb_pool *pool;
phys_addr_t tlb_addr;
+ unsigned int align;
int index;
if (!mem)
return NULL;
- index = swiotlb_find_slots(dev, 0, size, 0, &pool);
+ align = (1 << (get_order(size) + PAGE_SHIFT)) - 1;
+ index = swiotlb_find_slots(dev, 0, size, align, &pool);
if (index == -1)
return NULL;
tlb_addr = slot_addr(pool->start, index);
+ if (unlikely(!PAGE_ALIGNED(tlb_addr))) {
+ dev_WARN_ONCE(dev, 1, "Cannot allocate pages from non page-aligned swiotlb addr 0x%pa.\n",
+ &tlb_addr);
+ swiotlb_release_slots(dev, tlb_addr);
+ return NULL;
+ }
return pfn_to_page(PFN_DOWN(tlb_addr));
}
diff --git a/kernel/crash_dump.c b/kernel/elfcorehdr.c
index 92da32275af5..92da32275af5 100644
--- a/kernel/crash_dump.c
+++ b/kernel/elfcorehdr.c
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 88cb3c88aaa5..90843cc38588 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -57,8 +57,14 @@ long syscall_trace_enter(struct pt_regs *regs, long syscall,
/* Either of the above might have changed the syscall number */
syscall = syscall_get_nr(current, regs);
- if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
+ if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) {
trace_sys_enter(regs, syscall);
+ /*
+ * Probes or BPF hooks in the tracepoint may have changed the
+ * system call number as well.
+ */
+ syscall = syscall_get_nr(current, regs);
+ }
syscall_enter_audit(regs, syscall);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f0f0f71213a1..724e6d7e128f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9302,10 +9302,6 @@ void perf_event_bpf_event(struct bpf_prog *prog,
{
struct perf_bpf_event bpf_event;
- if (type <= PERF_BPF_EVENT_UNKNOWN ||
- type >= PERF_BPF_EVENT_MAX)
- return;
-
switch (type) {
case PERF_BPF_EVENT_PROG_LOAD:
case PERF_BPF_EVENT_PROG_UNLOAD:
@@ -9313,7 +9309,7 @@ void perf_event_bpf_event(struct bpf_prog *prog,
perf_event_bpf_emit_ksymbols(prog, type);
break;
default:
- break;
+ return;
}
if (!atomic_read(&nr_bpf_events))
@@ -10557,7 +10553,7 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
(is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT))
return -EINVAL;
- if (prog->type == BPF_PROG_TYPE_KPROBE && prog->aux->sleepable && !is_uprobe)
+ if (prog->type == BPF_PROG_TYPE_KPROBE && prog->sleepable && !is_uprobe)
/* only uprobe programs are allowed to be sleepable */
return -EINVAL;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 929e98c62965..e4834d23e1d1 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -188,7 +188,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
dec_mm_counter(mm, MM_ANONPAGES);
if (!folio_test_anon(old_folio)) {
- dec_mm_counter(mm, mm_counter_file(old_page));
+ dec_mm_counter(mm, mm_counter_file(old_folio));
inc_mm_counter(mm, MM_ANONPAGES);
}
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 9a24574988d2..b2fc2727d654 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -43,6 +43,7 @@ static int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
* Zero means infinite timeout - no checking done:
*/
unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
+EXPORT_SYMBOL_GPL(sysctl_hung_task_timeout_secs);
/*
* Zero (default value) means use sysctl_hung_task_timeout_secs:
diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c
index b4cac76ea5e9..8a689b4ff4f9 100644
--- a/kernel/kallsyms_selftest.c
+++ b/kernel/kallsyms_selftest.c
@@ -89,7 +89,6 @@ static struct test_item test_items[] = {
ITEM_DATA(kallsyms_test_var_data_static),
ITEM_DATA(kallsyms_test_var_bss),
ITEM_DATA(kallsyms_test_var_data),
- ITEM_DATA(vmap_area_list),
#endif
};
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 8f35a5a42af8..bab542fc1463 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -28,12 +28,14 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
struct kimage *image;
bool kexec_on_panic = flags & KEXEC_ON_CRASH;
+#ifdef CONFIG_CRASH_DUMP
if (kexec_on_panic) {
/* Verify we have a valid entry point */
if ((entry < phys_to_boot_phys(crashk_res.start)) ||
(entry > phys_to_boot_phys(crashk_res.end)))
return -EADDRNOTAVAIL;
}
+#endif
/* Allocate and initialize a controlling structure */
image = do_kimage_alloc_init();
@@ -44,11 +46,13 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
image->nr_segments = nr_segments;
memcpy(image->segment, segments, nr_segments * sizeof(*segments));
+#ifdef CONFIG_CRASH_DUMP
if (kexec_on_panic) {
/* Enable special crash kernel control page alloc policy. */
image->control_page = crashk_res.start;
image->type = KEXEC_TYPE_CRASH;
}
+#endif
ret = sanity_check_segment_list(image);
if (ret)
@@ -99,13 +103,14 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
if (!kexec_trylock())
return -EBUSY;
+#ifdef CONFIG_CRASH_DUMP
if (flags & KEXEC_ON_CRASH) {
dest_image = &kexec_crash_image;
if (kexec_crash_image)
arch_kexec_unprotect_crashkres();
- } else {
+ } else
+#endif
dest_image = &kexec_image;
- }
if (nr_segments == 0) {
/* Uninstall image */
@@ -162,8 +167,10 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
image = xchg(dest_image, image);
out:
+#ifdef CONFIG_CRASH_DUMP
if ((flags & KEXEC_ON_CRASH) && kexec_crash_image)
arch_kexec_protect_crashkres();
+#endif
kimage_free(image);
out_unlock:
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index d08fc7b5db97..0e96f6b24344 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -54,30 +54,6 @@ bool kexec_in_progress = false;
bool kexec_file_dbg_print;
-int kexec_should_crash(struct task_struct *p)
-{
- /*
- * If crash_kexec_post_notifiers is enabled, don't run
- * crash_kexec() here yet, which must be run after panic
- * notifiers in panic().
- */
- if (crash_kexec_post_notifiers)
- return 0;
- /*
- * There are 4 panic() calls in make_task_dead() path, each of which
- * corresponds to each of these 4 conditions.
- */
- if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
- return 1;
- return 0;
-}
-
-int kexec_crash_loaded(void)
-{
- return !!kexec_crash_image;
-}
-EXPORT_SYMBOL_GPL(kexec_crash_loaded);
-
/*
* When kexec transitions to the new kernel there is a one-to-one
* mapping between physical and virtual addresses. On processors
@@ -209,6 +185,7 @@ int sanity_check_segment_list(struct kimage *image)
if (total_pages > nr_pages / 2)
return -EINVAL;
+#ifdef CONFIG_CRASH_DUMP
/*
* Verify we have good destination addresses. Normally
* the caller is responsible for making certain we don't
@@ -231,6 +208,7 @@ int sanity_check_segment_list(struct kimage *image)
return -EADDRNOTAVAIL;
}
}
+#endif
return 0;
}
@@ -403,6 +381,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
return pages;
}
+#ifdef CONFIG_CRASH_DUMP
static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
unsigned int order)
{
@@ -468,6 +447,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
return pages;
}
+#endif
struct page *kimage_alloc_control_pages(struct kimage *image,
@@ -479,48 +459,16 @@ struct page *kimage_alloc_control_pages(struct kimage *image,
case KEXEC_TYPE_DEFAULT:
pages = kimage_alloc_normal_control_pages(image, order);
break;
+#ifdef CONFIG_CRASH_DUMP
case KEXEC_TYPE_CRASH:
pages = kimage_alloc_crash_control_pages(image, order);
break;
+#endif
}
return pages;
}
-int kimage_crash_copy_vmcoreinfo(struct kimage *image)
-{
- struct page *vmcoreinfo_page;
- void *safecopy;
-
- if (image->type != KEXEC_TYPE_CRASH)
- return 0;
-
- /*
- * For kdump, allocate one vmcoreinfo safe copy from the
- * crash memory. as we have arch_kexec_protect_crashkres()
- * after kexec syscall, we naturally protect it from write
- * (even read) access under kernel direct mapping. But on
- * the other hand, we still need to operate it when crash
- * happens to generate vmcoreinfo note, hereby we rely on
- * vmap for this purpose.
- */
- vmcoreinfo_page = kimage_alloc_control_pages(image, 0);
- if (!vmcoreinfo_page) {
- pr_warn("Could not allocate vmcoreinfo buffer\n");
- return -ENOMEM;
- }
- safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL);
- if (!safecopy) {
- pr_warn("Could not vmap vmcoreinfo buffer\n");
- return -ENOMEM;
- }
-
- image->vmcoreinfo_data_copy = safecopy;
- crash_update_vmcoreinfo_safecopy(safecopy);
-
- return 0;
-}
-
static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
{
if (*image->entry != 0)
@@ -603,10 +551,12 @@ void kimage_free(struct kimage *image)
if (!image)
return;
+#ifdef CONFIG_CRASH_DUMP
if (image->vmcoreinfo_data_copy) {
crash_update_vmcoreinfo_safecopy(NULL);
vunmap(image->vmcoreinfo_data_copy);
}
+#endif
kimage_free_extra_pages(image);
for_each_kimage_entry(image, ptr, entry) {
@@ -800,22 +750,24 @@ static int kimage_load_normal_segment(struct kimage *image,
PAGE_SIZE - (maddr & ~PAGE_MASK));
uchunk = min(ubytes, mchunk);
- /* For file based kexec, source pages are in kernel memory */
- if (image->file_mode)
- memcpy(ptr, kbuf, uchunk);
- else
- result = copy_from_user(ptr, buf, uchunk);
+ if (uchunk) {
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
+ ubytes -= uchunk;
+ if (image->file_mode)
+ kbuf += uchunk;
+ else
+ buf += uchunk;
+ }
kunmap_local(ptr);
if (result) {
result = -EFAULT;
goto out;
}
- ubytes -= uchunk;
maddr += mchunk;
- if (image->file_mode)
- kbuf += mchunk;
- else
- buf += mchunk;
mbytes -= mchunk;
cond_resched();
@@ -824,6 +776,7 @@ out:
return result;
}
+#ifdef CONFIG_CRASH_DUMP
static int kimage_load_crash_segment(struct kimage *image,
struct kexec_segment *segment)
{
@@ -866,11 +819,18 @@ static int kimage_load_crash_segment(struct kimage *image,
memset(ptr + uchunk, 0, mchunk - uchunk);
}
- /* For file based kexec, source pages are in kernel memory */
- if (image->file_mode)
- memcpy(ptr, kbuf, uchunk);
- else
- result = copy_from_user(ptr, buf, uchunk);
+ if (uchunk) {
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
+ ubytes -= uchunk;
+ if (image->file_mode)
+ kbuf += uchunk;
+ else
+ buf += uchunk;
+ }
kexec_flush_icache_page(page);
kunmap_local(ptr);
arch_kexec_pre_free_pages(page_address(page), 1);
@@ -878,12 +838,7 @@ static int kimage_load_crash_segment(struct kimage *image,
result = -EFAULT;
goto out;
}
- ubytes -= uchunk;
maddr += mchunk;
- if (image->file_mode)
- kbuf += mchunk;
- else
- buf += mchunk;
mbytes -= mchunk;
cond_resched();
@@ -891,6 +846,7 @@ static int kimage_load_crash_segment(struct kimage *image,
out:
return result;
}
+#endif
int kimage_load_segment(struct kimage *image,
struct kexec_segment *segment)
@@ -901,9 +857,11 @@ int kimage_load_segment(struct kimage *image,
case KEXEC_TYPE_DEFAULT:
result = kimage_load_normal_segment(image, segment);
break;
+#ifdef CONFIG_CRASH_DUMP
case KEXEC_TYPE_CRASH:
result = kimage_load_crash_segment(image, segment);
break;
+#endif
}
return result;
@@ -1028,186 +986,6 @@ bool kexec_load_permitted(int kexec_image_type)
}
/*
- * No panic_cpu check version of crash_kexec(). This function is called
- * only when panic_cpu holds the current CPU number; this is the only CPU
- * which processes crash_kexec routines.
- */
-void __noclone __crash_kexec(struct pt_regs *regs)
-{
- /* Take the kexec_lock here to prevent sys_kexec_load
- * running on one cpu from replacing the crash kernel
- * we are using after a panic on a different cpu.
- *
- * If the crash kernel was not located in a fixed area
- * of memory the xchg(&kexec_crash_image) would be
- * sufficient. But since I reuse the memory...
- */
- if (kexec_trylock()) {
- if (kexec_crash_image) {
- struct pt_regs fixed_regs;
-
- crash_setup_regs(&fixed_regs, regs);
- crash_save_vmcoreinfo();
- machine_crash_shutdown(&fixed_regs);
- machine_kexec(kexec_crash_image);
- }
- kexec_unlock();
- }
-}
-STACK_FRAME_NON_STANDARD(__crash_kexec);
-
-__bpf_kfunc void crash_kexec(struct pt_regs *regs)
-{
- int old_cpu, this_cpu;
-
- /*
- * Only one CPU is allowed to execute the crash_kexec() code as with
- * panic(). Otherwise parallel calls of panic() and crash_kexec()
- * may stop each other. To exclude them, we use panic_cpu here too.
- */
- old_cpu = PANIC_CPU_INVALID;
- this_cpu = raw_smp_processor_id();
-
- if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) {
- /* This is the 1st CPU which comes here, so go ahead. */
- __crash_kexec(regs);
-
- /*
- * Reset panic_cpu to allow another panic()/crash_kexec()
- * call.
- */
- atomic_set(&panic_cpu, PANIC_CPU_INVALID);
- }
-}
-
-static inline resource_size_t crash_resource_size(const struct resource *res)
-{
- return !res->end ? 0 : resource_size(res);
-}
-
-ssize_t crash_get_memory_size(void)
-{
- ssize_t size = 0;
-
- if (!kexec_trylock())
- return -EBUSY;
-
- size += crash_resource_size(&crashk_res);
- size += crash_resource_size(&crashk_low_res);
-
- kexec_unlock();
- return size;
-}
-
-static int __crash_shrink_memory(struct resource *old_res,
- unsigned long new_size)
-{
- struct resource *ram_res;
-
- ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
- if (!ram_res)
- return -ENOMEM;
-
- ram_res->start = old_res->start + new_size;
- ram_res->end = old_res->end;
- ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
- ram_res->name = "System RAM";
-
- if (!new_size) {
- release_resource(old_res);
- old_res->start = 0;
- old_res->end = 0;
- } else {
- crashk_res.end = ram_res->start - 1;
- }
-
- crash_free_reserved_phys_range(ram_res->start, ram_res->end);
- insert_resource(&iomem_resource, ram_res);
-
- return 0;
-}
-
-int crash_shrink_memory(unsigned long new_size)
-{
- int ret = 0;
- unsigned long old_size, low_size;
-
- if (!kexec_trylock())
- return -EBUSY;
-
- if (kexec_crash_image) {
- ret = -ENOENT;
- goto unlock;
- }
-
- low_size = crash_resource_size(&crashk_low_res);
- old_size = crash_resource_size(&crashk_res) + low_size;
- new_size = roundup(new_size, KEXEC_CRASH_MEM_ALIGN);
- if (new_size >= old_size) {
- ret = (new_size == old_size) ? 0 : -EINVAL;
- goto unlock;
- }
-
- /*
- * (low_size > new_size) implies that low_size is greater than zero.
- * This also means that if low_size is zero, the else branch is taken.
- *
- * If low_size is greater than 0, (low_size > new_size) indicates that
- * crashk_low_res also needs to be shrunken. Otherwise, only crashk_res
- * needs to be shrunken.
- */
- if (low_size > new_size) {
- ret = __crash_shrink_memory(&crashk_res, 0);
- if (ret)
- goto unlock;
-
- ret = __crash_shrink_memory(&crashk_low_res, new_size);
- } else {
- ret = __crash_shrink_memory(&crashk_res, new_size - low_size);
- }
-
- /* Swap crashk_res and crashk_low_res if needed */
- if (!crashk_res.end && crashk_low_res.end) {
- crashk_res.start = crashk_low_res.start;
- crashk_res.end = crashk_low_res.end;
- release_resource(&crashk_low_res);
- crashk_low_res.start = 0;
- crashk_low_res.end = 0;
- insert_resource(&iomem_resource, &crashk_res);
- }
-
-unlock:
- kexec_unlock();
- return ret;
-}
-
-void crash_save_cpu(struct pt_regs *regs, int cpu)
-{
- struct elf_prstatus prstatus;
- u32 *buf;
-
- if ((cpu < 0) || (cpu >= nr_cpu_ids))
- return;
-
- /* Using ELF notes here is opportunistic.
- * I need a well defined structure format
- * for the data I pass, and I need tags
- * on the data to indicate what information I have
- * squirrelled away. ELF notes happen to provide
- * all of that, so there is no need to invent something new.
- */
- buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
- if (!buf)
- return;
- memset(&prstatus, 0, sizeof(prstatus));
- prstatus.common.pr_pid = current->pid;
- elf_core_copy_regs(&prstatus.pr_reg, regs);
- buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
- &prstatus, sizeof(prstatus));
- final_note(buf);
-}
-
-/*
* Move into place and start executing a preloaded standalone
* executable. If nothing was preloaded return an error.
*/
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index bef2f6f2571b..2d1db05fbf04 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -285,11 +285,13 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
kexec_file_dbg_print = !!(flags & KEXEC_FILE_DEBUG);
image->file_mode = 1;
+#ifdef CONFIG_CRASH_DUMP
if (kexec_on_panic) {
/* Enable special crash kernel control page alloc policy. */
image->control_page = crashk_res.start;
image->type = KEXEC_TYPE_CRASH;
}
+#endif
ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
cmdline_ptr, cmdline_len, flags);
@@ -349,13 +351,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
if (!kexec_trylock())
return -EBUSY;
+#ifdef CONFIG_CRASH_DUMP
if (image_type == KEXEC_TYPE_CRASH) {
dest_image = &kexec_crash_image;
if (kexec_crash_image)
arch_kexec_unprotect_crashkres();
- } else {
+ } else
+#endif
dest_image = &kexec_image;
- }
if (flags & KEXEC_FILE_UNLOAD)
goto exchange;
@@ -419,8 +422,10 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
exchange:
image = xchg(dest_image, image);
out:
+#ifdef CONFIG_CRASH_DUMP
if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image)
arch_kexec_protect_crashkres();
+#endif
kexec_unlock();
kimage_free(image);
@@ -535,8 +540,10 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf,
phys_addr_t mstart, mend;
struct resource res = { };
+#ifdef CONFIG_CRASH_DUMP
if (kbuf->image->type == KEXEC_TYPE_CRASH)
return func(&crashk_res, kbuf);
+#endif
/*
* Using MEMBLOCK_NONE will properly skip MEMBLOCK_DRIVER_MANAGED. See
@@ -595,12 +602,14 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf,
static int kexec_walk_resources(struct kexec_buf *kbuf,
int (*func)(struct resource *, void *))
{
+#ifdef CONFIG_CRASH_DUMP
if (kbuf->image->type == KEXEC_TYPE_CRASH)
return walk_iomem_res_desc(crashk_res.desc,
IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
crashk_res.start, crashk_res.end,
kbuf, func);
- else if (kbuf->top_down)
+#endif
+ if (kbuf->top_down)
return walk_system_ram_res_rev(0, ULONG_MAX, kbuf, func);
else
return walk_system_ram_res(0, ULONG_MAX, kbuf, func);
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 74da1409cd14..2595defe8c0d 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -4,6 +4,8 @@
#include <linux/kexec.h>
+struct kexec_segment;
+
struct kimage *do_kimage_alloc_init(void);
int sanity_check_segment_list(struct kimage *image);
void kimage_free_page_list(struct list_head *list);
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 1d4bc493b2f4..495b69a71a5d 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -39,7 +39,7 @@ static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
static ssize_t uevent_seqnum_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sysfs_emit(buf, "%llu\n", (unsigned long long)uevent_seqnum);
+ return sysfs_emit(buf, "%llu\n", (u64)atomic64_read(&uevent_seqnum));
}
KERNEL_ATTR_RO(uevent_seqnum);
@@ -120,6 +120,7 @@ static ssize_t kexec_loaded_show(struct kobject *kobj,
}
KERNEL_ATTR_RO(kexec_loaded);
+#ifdef CONFIG_CRASH_DUMP
static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -152,9 +153,10 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj,
}
KERNEL_ATTR_RW(kexec_crash_size);
+#endif /* CONFIG_CRASH_DUMP*/
#endif /* CONFIG_KEXEC_CORE */
-#ifdef CONFIG_CRASH_CORE
+#ifdef CONFIG_VMCORE_INFO
static ssize_t vmcoreinfo_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
@@ -177,7 +179,7 @@ KERNEL_ATTR_RO(crash_elfcorehdr_size);
#endif
-#endif /* CONFIG_CRASH_CORE */
+#endif /* CONFIG_VMCORE_INFO */
/* whether file capabilities are enabled */
static ssize_t fscaps_show(struct kobject *kobj,
@@ -262,10 +264,12 @@ static struct attribute * kernel_attrs[] = {
#endif
#ifdef CONFIG_KEXEC_CORE
&kexec_loaded_attr.attr,
+#ifdef CONFIG_CRASH_DUMP
&kexec_crash_loaded_attr.attr,
&kexec_crash_size_attr.attr,
#endif
-#ifdef CONFIG_CRASH_CORE
+#endif
+#ifdef CONFIG_VMCORE_INFO
&vmcoreinfo_attr.attr,
#ifdef CONFIG_CRASH_HOTPLUG
&crash_elfcorehdr_size_attr.attr,
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index 0ea1b2970a23..c3ced519e14b 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -362,8 +362,7 @@ config MODPROBE_PATH
userspace can still load modules explicitly).
config TRIM_UNUSED_KSYMS
- bool "Trim unused exported kernel symbols" if EXPERT
- depends on !COMPILE_TEST
+ bool "Trim unused exported kernel symbols"
help
The kernel and some modules make many symbols available for
other modules to use via EXPORT_SYMBOL() and variants. Depending
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index c8b7b4dcf782..2ebece8a789f 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -322,9 +322,9 @@ static inline struct module *mod_find(unsigned long addr, struct mod_tree_root *
}
#endif /* CONFIG_MODULES_TREE_LOOKUP */
-void module_enable_ro(const struct module *mod, bool after_init);
-void module_enable_nx(const struct module *mod);
-void module_enable_x(const struct module *mod);
+int module_enable_rodata_ro(const struct module *mod, bool after_init);
+int module_enable_data_nx(const struct module *mod);
+int module_enable_text_rox(const struct module *mod);
int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
char *secstrings, struct module *mod);
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 36681911c05a..e1e8a7a9d6c1 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2489,6 +2489,11 @@ static void do_free_init(struct work_struct *w)
}
}
+void flush_module_init_free_work(void)
+{
+ flush_work(&init_free_wq);
+}
+
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX "module."
/* Default value for module->async_probe_requested */
@@ -2571,7 +2576,9 @@ static noinline int do_init_module(struct module *mod)
/* Switch to core kallsyms now init is done: kallsyms may be walking! */
rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
#endif
- module_enable_ro(mod, true);
+ ret = module_enable_rodata_ro(mod, true);
+ if (ret)
+ goto fail_mutex_unlock;
mod_tree_remove_init(mod);
module_arch_freeing_init(mod);
for_class_mod_mem_type(type, init) {
@@ -2593,8 +2600,8 @@ static noinline int do_init_module(struct module *mod)
* Note that module_alloc() on most architectures creates W+X page
* mappings which won't be cleaned up until do_free_init() runs. Any
* code such as mark_rodata_ro() which depends on those mappings to
- * be cleaned up needs to sync with the queued work - ie
- * rcu_barrier()
+ * be cleaned up needs to sync with the queued work by invoking
+ * flush_module_init_free_work().
*/
if (llist_add(&freeinit->node, &init_free_list))
schedule_work(&init_free_wq);
@@ -2609,6 +2616,8 @@ static noinline int do_init_module(struct module *mod)
return 0;
+fail_mutex_unlock:
+ mutex_unlock(&module_mutex);
fail_free_freeinit:
kfree(freeinit);
fail:
@@ -2736,9 +2745,15 @@ static int complete_formation(struct module *mod, struct load_info *info)
module_bug_finalize(info->hdr, info->sechdrs, mod);
module_cfi_finalize(info->hdr, info->sechdrs, mod);
- module_enable_ro(mod, false);
- module_enable_nx(mod);
- module_enable_x(mod);
+ err = module_enable_rodata_ro(mod, false);
+ if (err)
+ goto out_strict_rwx;
+ err = module_enable_data_nx(mod);
+ if (err)
+ goto out_strict_rwx;
+ err = module_enable_text_rox(mod);
+ if (err)
+ goto out_strict_rwx;
/*
* Mark state as coming so strong_try_module_get() ignores us,
@@ -2749,6 +2764,8 @@ static int complete_formation(struct module *mod, struct load_info *info)
return 0;
+out_strict_rwx:
+ module_bug_cleanup(mod);
out:
mutex_unlock(&module_mutex);
return err;
diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c
index a2b656b4e3d2..c45caa4690e5 100644
--- a/kernel/module/strict_rwx.c
+++ b/kernel/module/strict_rwx.c
@@ -11,13 +11,16 @@
#include <linux/set_memory.h>
#include "internal.h"
-static void module_set_memory(const struct module *mod, enum mod_mem_type type,
- int (*set_memory)(unsigned long start, int num_pages))
+static int module_set_memory(const struct module *mod, enum mod_mem_type type,
+ int (*set_memory)(unsigned long start, int num_pages))
{
const struct module_memory *mod_mem = &mod->mem[type];
+ if (!mod_mem->base)
+ return 0;
+
set_vm_flush_reset_perms(mod_mem->base);
- set_memory((unsigned long)mod_mem->base, mod_mem->size >> PAGE_SHIFT);
+ return set_memory((unsigned long)mod_mem->base, mod_mem->size >> PAGE_SHIFT);
}
/*
@@ -26,37 +29,53 @@ static void module_set_memory(const struct module *mod, enum mod_mem_type type,
* CONFIG_STRICT_MODULE_RWX because they are needed regardless of whether we
* are strict.
*/
-void module_enable_x(const struct module *mod)
+int module_enable_text_rox(const struct module *mod)
{
- for_class_mod_mem_type(type, text)
- module_set_memory(mod, type, set_memory_x);
+ for_class_mod_mem_type(type, text) {
+ int ret;
+
+ if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ ret = module_set_memory(mod, type, set_memory_rox);
+ else
+ ret = module_set_memory(mod, type, set_memory_x);
+ if (ret)
+ return ret;
+ }
+ return 0;
}
-void module_enable_ro(const struct module *mod, bool after_init)
+int module_enable_rodata_ro(const struct module *mod, bool after_init)
{
- if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
- return;
-#ifdef CONFIG_STRICT_MODULE_RWX
- if (!rodata_enabled)
- return;
-#endif
+ int ret;
+
+ if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX) || !rodata_enabled)
+ return 0;
- module_set_memory(mod, MOD_TEXT, set_memory_ro);
- module_set_memory(mod, MOD_INIT_TEXT, set_memory_ro);
- module_set_memory(mod, MOD_RODATA, set_memory_ro);
- module_set_memory(mod, MOD_INIT_RODATA, set_memory_ro);
+ ret = module_set_memory(mod, MOD_RODATA, set_memory_ro);
+ if (ret)
+ return ret;
+ ret = module_set_memory(mod, MOD_INIT_RODATA, set_memory_ro);
+ if (ret)
+ return ret;
if (after_init)
- module_set_memory(mod, MOD_RO_AFTER_INIT, set_memory_ro);
+ return module_set_memory(mod, MOD_RO_AFTER_INIT, set_memory_ro);
+
+ return 0;
}
-void module_enable_nx(const struct module *mod)
+int module_enable_data_nx(const struct module *mod)
{
if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
- return;
+ return 0;
- for_class_mod_mem_type(type, data)
- module_set_memory(mod, type, set_memory_nx);
+ for_class_mod_mem_type(type, data) {
+ int ret = module_set_memory(mod, type, set_memory_nx);
+
+ if (ret)
+ return ret;
+ }
+ return 0;
}
int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
diff --git a/kernel/padata.c b/kernel/padata.c
index 179fb1518070..e3f639ff1670 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -485,7 +485,8 @@ void __init padata_do_multithreaded(struct padata_mt_job *job)
struct padata_work my_work, *pw;
struct padata_mt_job_state ps;
LIST_HEAD(works);
- int nworks;
+ int nworks, nid;
+ static atomic_t last_used_nid __initdata;
if (job->size == 0)
return;
@@ -517,7 +518,16 @@ void __init padata_do_multithreaded(struct padata_mt_job *job)
ps.chunk_size = roundup(ps.chunk_size, job->align);
list_for_each_entry(pw, &works, pw_list)
- queue_work(system_unbound_wq, &pw->pw_work);
+ if (job->numa_aware) {
+ int old_node = atomic_read(&last_used_nid);
+
+ do {
+ nid = next_node_in(old_node, node_states[N_CPU]);
+ } while (!atomic_try_cmpxchg(&last_used_nid, &old_node, nid));
+ queue_work_node(nid, system_unbound_wq, &pw->pw_work);
+ } else {
+ queue_work(system_unbound_wq, &pw->pw_work);
+ }
/* Use the current thread, which saves starting a workqueue worker. */
padata_work_init(&my_work, padata_mt_helper, &ps, PADATA_WORK_ONSTACK);
diff --git a/kernel/panic.c b/kernel/panic.c
index 2807639aab51..747c3f3d289a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -73,6 +73,7 @@ EXPORT_SYMBOL_GPL(panic_timeout);
#define PANIC_PRINT_FTRACE_INFO 0x00000010
#define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020
#define PANIC_PRINT_ALL_CPU_BT 0x00000040
+#define PANIC_PRINT_BLOCKED_TASKS 0x00000080
unsigned long panic_print;
ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
@@ -227,6 +228,9 @@ static void panic_print_sys_info(bool console_flush)
if (panic_print & PANIC_PRINT_FTRACE_INFO)
ftrace_dump(DUMP_ALL);
+
+ if (panic_print & PANIC_PRINT_BLOCKED_TASKS)
+ show_state_filter(TASK_UNINTERRUPTIBLE);
}
void check_panic_on_warn(const char *origin)
@@ -446,6 +450,14 @@ void panic(const char *fmt, ...)
/* Do not scroll important messages printed above */
suppress_printk = 1;
+
+ /*
+ * The final messages may not have been printed if in a context that
+ * defers printing (such as NMI) and irq_work is not available.
+ * Explicitly flush the kernel log buffer one last time.
+ */
+ console_flush_on_panic(CONSOLE_FLUSH_PENDING);
+
local_irq_enable();
for (i = 0; ; i += PANIC_TIMER_STEP) {
touch_softlockup_watchdog();
@@ -666,8 +678,13 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
pr_warn("WARNING: CPU: %d PID: %d at %pS\n",
raw_smp_processor_id(), current->pid, caller);
+#pragma GCC diagnostic push
+#ifndef __clang__
+#pragma GCC diagnostic ignored "-Wsuggest-attribute=format"
+#endif
if (args)
vprintk(args->fmt, args->args);
+#pragma GCC diagnostic pop
print_modules();
diff --git a/kernel/pid.c b/kernel/pid.c
index 99a0c5eb24b8..da76ed1873f7 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -62,17 +62,13 @@ struct pid init_struct_pid = {
int pid_max = PID_MAX_DEFAULT;
-#define RESERVED_PIDS 300
-
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
-#ifdef CONFIG_FS_PID
/*
* Pseudo filesystems start inode numbering after one. We use Reserved
* PIDs as a natural offset.
*/
static u64 pidfs_ino = RESERVED_PIDS;
-#endif
/*
* PID-map pages start out as NULL, they get allocated upon
@@ -280,10 +276,8 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
spin_lock_irq(&pidmap_lock);
if (!(ns->pid_allocated & PIDNS_ADDING))
goto out_unlock;
-#ifdef CONFIG_FS_PID
pid->stashed = NULL;
pid->ino = ++pidfs_ino;
-#endif
for ( ; upid >= pid->numbers; --upid) {
/* Make the PID visible to find_pid_ns. */
idr_replace(&upid->ns->idr, pid, upid->nr);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 4b31629c5be4..afce8130d8b9 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -39,9 +39,9 @@ config HIBERNATION
bool "Hibernation (aka 'suspend to disk')"
depends on SWAP && ARCH_HIBERNATION_POSSIBLE
select HIBERNATE_CALLBACKS
- select LZO_COMPRESS
- select LZO_DECOMPRESS
select CRC32
+ select CRYPTO
+ select CRYPTO_LZO
help
Enable the suspend to disk (STD) functionality, which is usually
called "hibernation" in user interfaces. STD checkpoints the
@@ -92,6 +92,28 @@ config HIBERNATION_SNAPSHOT_DEV
If in doubt, say Y.
+choice
+ prompt "Default compressor"
+ default HIBERNATION_COMP_LZO
+ depends on HIBERNATION
+
+config HIBERNATION_COMP_LZO
+ bool "lzo"
+ depends on CRYPTO_LZO
+
+config HIBERNATION_COMP_LZ4
+ bool "lz4"
+ depends on CRYPTO_LZ4
+
+endchoice
+
+config HIBERNATION_DEF_COMP
+ string
+ default "lzo" if HIBERNATION_COMP_LZO
+ default "lz4" if HIBERNATION_COMP_LZ4
+ help
+ Default compressor to be used for hibernation.
+
config PM_STD_PARTITION
string "Default resume partition"
depends on HIBERNATION
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 7b44f5b89fa1..9e1c9aa399ea 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -23,6 +23,12 @@
*/
static DEFINE_MUTEX(em_pd_mutex);
+static void em_cpufreq_update_efficiencies(struct device *dev,
+ struct em_perf_state *table);
+static void em_check_capacity_update(void);
+static void em_update_workfn(struct work_struct *work);
+static DECLARE_DELAYED_WORK(em_update_work, em_update_workfn);
+
static bool _is_cpu_device(struct device *dev)
{
return (dev->bus == &cpu_subsys);
@@ -31,19 +37,65 @@ static bool _is_cpu_device(struct device *dev)
#ifdef CONFIG_DEBUG_FS
static struct dentry *rootdir;
-static void em_debug_create_ps(struct em_perf_state *ps, struct dentry *pd)
+struct em_dbg_info {
+ struct em_perf_domain *pd;
+ int ps_id;
+};
+
+#define DEFINE_EM_DBG_SHOW(name, fname) \
+static int em_debug_##fname##_show(struct seq_file *s, void *unused) \
+{ \
+ struct em_dbg_info *em_dbg = s->private; \
+ struct em_perf_state *table; \
+ unsigned long val; \
+ \
+ rcu_read_lock(); \
+ table = em_perf_state_from_pd(em_dbg->pd); \
+ val = table[em_dbg->ps_id].name; \
+ rcu_read_unlock(); \
+ \
+ seq_printf(s, "%lu\n", val); \
+ return 0; \
+} \
+DEFINE_SHOW_ATTRIBUTE(em_debug_##fname)
+
+DEFINE_EM_DBG_SHOW(frequency, frequency);
+DEFINE_EM_DBG_SHOW(power, power);
+DEFINE_EM_DBG_SHOW(cost, cost);
+DEFINE_EM_DBG_SHOW(performance, performance);
+DEFINE_EM_DBG_SHOW(flags, inefficiency);
+
+static void em_debug_create_ps(struct em_perf_domain *em_pd,
+ struct em_dbg_info *em_dbg, int i,
+ struct dentry *pd)
{
+ struct em_perf_state *table;
+ unsigned long freq;
struct dentry *d;
char name[24];
- snprintf(name, sizeof(name), "ps:%lu", ps->frequency);
+ em_dbg[i].pd = em_pd;
+ em_dbg[i].ps_id = i;
+
+ rcu_read_lock();
+ table = em_perf_state_from_pd(em_pd);
+ freq = table[i].frequency;
+ rcu_read_unlock();
+
+ snprintf(name, sizeof(name), "ps:%lu", freq);
/* Create per-ps directory */
d = debugfs_create_dir(name, pd);
- debugfs_create_ulong("frequency", 0444, d, &ps->frequency);
- debugfs_create_ulong("power", 0444, d, &ps->power);
- debugfs_create_ulong("cost", 0444, d, &ps->cost);
- debugfs_create_ulong("inefficient", 0444, d, &ps->flags);
+ debugfs_create_file("frequency", 0444, d, &em_dbg[i],
+ &em_debug_frequency_fops);
+ debugfs_create_file("power", 0444, d, &em_dbg[i],
+ &em_debug_power_fops);
+ debugfs_create_file("cost", 0444, d, &em_dbg[i],
+ &em_debug_cost_fops);
+ debugfs_create_file("performance", 0444, d, &em_dbg[i],
+ &em_debug_performance_fops);
+ debugfs_create_file("inefficient", 0444, d, &em_dbg[i],
+ &em_debug_inefficiency_fops);
}
static int em_debug_cpus_show(struct seq_file *s, void *unused)
@@ -66,6 +118,7 @@ DEFINE_SHOW_ATTRIBUTE(em_debug_flags);
static void em_debug_create_pd(struct device *dev)
{
+ struct em_dbg_info *em_dbg;
struct dentry *d;
int i;
@@ -79,9 +132,14 @@ static void em_debug_create_pd(struct device *dev)
debugfs_create_file("flags", 0444, d, dev->em_pd,
&em_debug_flags_fops);
+ em_dbg = devm_kcalloc(dev, dev->em_pd->nr_perf_states,
+ sizeof(*em_dbg), GFP_KERNEL);
+ if (!em_dbg)
+ return;
+
/* Create a sub-directory for each performance state */
for (i = 0; i < dev->em_pd->nr_perf_states; i++)
- em_debug_create_ps(&dev->em_pd->table[i], d);
+ em_debug_create_ps(dev->em_pd, em_dbg, i, d);
}
@@ -103,18 +161,192 @@ static void em_debug_create_pd(struct device *dev) {}
static void em_debug_remove_pd(struct device *dev) {}
#endif
+static void em_destroy_table_rcu(struct rcu_head *rp)
+{
+ struct em_perf_table __rcu *table;
+
+ table = container_of(rp, struct em_perf_table, rcu);
+ kfree(table);
+}
+
+static void em_release_table_kref(struct kref *kref)
+{
+ struct em_perf_table __rcu *table;
+
+ /* It was the last owner of this table so we can free */
+ table = container_of(kref, struct em_perf_table, kref);
+
+ call_rcu(&table->rcu, em_destroy_table_rcu);
+}
+
+/**
+ * em_table_free() - Handles safe free of the EM table when needed
+ * @table : EM table which is going to be freed
+ *
+ * No return values.
+ */
+void em_table_free(struct em_perf_table __rcu *table)
+{
+ kref_put(&table->kref, em_release_table_kref);
+}
+
+/**
+ * em_table_alloc() - Allocate a new EM table
+ * @pd : EM performance domain for which this must be done
+ *
+ * Allocate a new EM table and initialize its kref to indicate that it
+ * has a user.
+ * Returns allocated table or NULL.
+ */
+struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd)
+{
+ struct em_perf_table __rcu *table;
+ int table_size;
+
+ table_size = sizeof(struct em_perf_state) * pd->nr_perf_states;
+
+ table = kzalloc(sizeof(*table) + table_size, GFP_KERNEL);
+ if (!table)
+ return NULL;
+
+ kref_init(&table->kref);
+
+ return table;
+}
+
+static void em_init_performance(struct device *dev, struct em_perf_domain *pd,
+ struct em_perf_state *table, int nr_states)
+{
+ u64 fmax, max_cap;
+ int i, cpu;
+
+ /* This is needed only for CPUs and EAS skip other devices */
+ if (!_is_cpu_device(dev))
+ return;
+
+ cpu = cpumask_first(em_span_cpus(pd));
+
+ /*
+ * Calculate the performance value for each frequency with
+ * linear relationship. The final CPU capacity might not be ready at
+ * boot time, but the EM will be updated a bit later with correct one.
+ */
+ fmax = (u64) table[nr_states - 1].frequency;
+ max_cap = (u64) arch_scale_cpu_capacity(cpu);
+ for (i = 0; i < nr_states; i++)
+ table[i].performance = div64_u64(max_cap * table[i].frequency,
+ fmax);
+}
+
+static int em_compute_costs(struct device *dev, struct em_perf_state *table,
+ struct em_data_callback *cb, int nr_states,
+ unsigned long flags)
+{
+ unsigned long prev_cost = ULONG_MAX;
+ int i, ret;
+
+ /* Compute the cost of each performance state. */
+ for (i = nr_states - 1; i >= 0; i--) {
+ unsigned long power_res, cost;
+
+ if ((flags & EM_PERF_DOMAIN_ARTIFICIAL) && cb->get_cost) {
+ ret = cb->get_cost(dev, table[i].frequency, &cost);
+ if (ret || !cost || cost > EM_MAX_POWER) {
+ dev_err(dev, "EM: invalid cost %lu %d\n",
+ cost, ret);
+ return -EINVAL;
+ }
+ } else {
+ /* increase resolution of 'cost' precision */
+ power_res = table[i].power * 10;
+ cost = power_res / table[i].performance;
+ }
+
+ table[i].cost = cost;
+
+ if (table[i].cost >= prev_cost) {
+ table[i].flags = EM_PERF_STATE_INEFFICIENT;
+ dev_dbg(dev, "EM: OPP:%lu is inefficient\n",
+ table[i].frequency);
+ } else {
+ prev_cost = table[i].cost;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * em_dev_compute_costs() - Calculate cost values for new runtime EM table
+ * @dev : Device for which the EM table is to be updated
+ * @table : The new EM table that is going to get the costs calculated
+ * @nr_states : Number of performance states
+ *
+ * Calculate the em_perf_state::cost values for new runtime EM table. The
+ * values are used for EAS during task placement. It also calculates and sets
+ * the efficiency flag for each performance state. When the function finish
+ * successfully the EM table is ready to be updated and used by EAS.
+ *
+ * Return 0 on success or a proper error in case of failure.
+ */
+int em_dev_compute_costs(struct device *dev, struct em_perf_state *table,
+ int nr_states)
+{
+ return em_compute_costs(dev, table, NULL, nr_states, 0);
+}
+
+/**
+ * em_dev_update_perf_domain() - Update runtime EM table for a device
+ * @dev : Device for which the EM is to be updated
+ * @new_table : The new EM table that is going to be used from now
+ *
+ * Update EM runtime modifiable table for the @dev using the provided @table.
+ *
+ * This function uses a mutex to serialize writers, so it must not be called
+ * from a non-sleeping context.
+ *
+ * Return 0 on success or an error code on failure.
+ */
+int em_dev_update_perf_domain(struct device *dev,
+ struct em_perf_table __rcu *new_table)
+{
+ struct em_perf_table __rcu *old_table;
+ struct em_perf_domain *pd;
+
+ if (!dev)
+ return -EINVAL;
+
+ /* Serialize update/unregister or concurrent updates */
+ mutex_lock(&em_pd_mutex);
+
+ if (!dev->em_pd) {
+ mutex_unlock(&em_pd_mutex);
+ return -EINVAL;
+ }
+ pd = dev->em_pd;
+
+ kref_get(&new_table->kref);
+
+ old_table = pd->em_table;
+ rcu_assign_pointer(pd->em_table, new_table);
+
+ em_cpufreq_update_efficiencies(dev, new_table->state);
+
+ em_table_free(old_table);
+
+ mutex_unlock(&em_pd_mutex);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(em_dev_update_perf_domain);
+
static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
- int nr_states, struct em_data_callback *cb,
+ struct em_perf_state *table,
+ struct em_data_callback *cb,
unsigned long flags)
{
- unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX;
- struct em_perf_state *table;
+ unsigned long power, freq, prev_freq = 0;
+ int nr_states = pd->nr_perf_states;
int i, ret;
- u64 fmax;
-
- table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL);
- if (!table)
- return -ENOMEM;
/* Build the list of performance states for this performance domain */
for (i = 0, freq = 0; i < nr_states; i++, freq++) {
@@ -127,7 +359,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
if (ret) {
dev_err(dev, "EM: invalid perf. state: %d\n",
ret);
- goto free_ps_table;
+ return -EINVAL;
}
/*
@@ -137,7 +369,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
if (freq <= prev_freq) {
dev_err(dev, "EM: non-increasing freq: %lu\n",
freq);
- goto free_ps_table;
+ return -EINVAL;
}
/*
@@ -147,55 +379,27 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
if (!power || power > EM_MAX_POWER) {
dev_err(dev, "EM: invalid power: %lu\n",
power);
- goto free_ps_table;
+ return -EINVAL;
}
table[i].power = power;
table[i].frequency = prev_freq = freq;
}
- /* Compute the cost of each performance state. */
- fmax = (u64) table[nr_states - 1].frequency;
- for (i = nr_states - 1; i >= 0; i--) {
- unsigned long power_res, cost;
-
- if (flags & EM_PERF_DOMAIN_ARTIFICIAL) {
- ret = cb->get_cost(dev, table[i].frequency, &cost);
- if (ret || !cost || cost > EM_MAX_POWER) {
- dev_err(dev, "EM: invalid cost %lu %d\n",
- cost, ret);
- goto free_ps_table;
- }
- } else {
- power_res = table[i].power;
- cost = div64_u64(fmax * power_res, table[i].frequency);
- }
-
- table[i].cost = cost;
-
- if (table[i].cost >= prev_cost) {
- table[i].flags = EM_PERF_STATE_INEFFICIENT;
- dev_dbg(dev, "EM: OPP:%lu is inefficient\n",
- table[i].frequency);
- } else {
- prev_cost = table[i].cost;
- }
- }
+ em_init_performance(dev, pd, table, nr_states);
- pd->table = table;
- pd->nr_perf_states = nr_states;
+ ret = em_compute_costs(dev, table, cb, nr_states, flags);
+ if (ret)
+ return -EINVAL;
return 0;
-
-free_ps_table:
- kfree(table);
- return -EINVAL;
}
static int em_create_pd(struct device *dev, int nr_states,
struct em_data_callback *cb, cpumask_t *cpus,
unsigned long flags)
{
+ struct em_perf_table __rcu *em_table;
struct em_perf_domain *pd;
struct device *cpu_dev;
int cpu, ret, num_cpus;
@@ -220,11 +424,17 @@ static int em_create_pd(struct device *dev, int nr_states,
return -ENOMEM;
}
- ret = em_create_perf_table(dev, pd, nr_states, cb, flags);
- if (ret) {
- kfree(pd);
- return ret;
- }
+ pd->nr_perf_states = nr_states;
+
+ em_table = em_table_alloc(pd);
+ if (!em_table)
+ goto free_pd;
+
+ ret = em_create_perf_table(dev, pd, em_table->state, cb, flags);
+ if (ret)
+ goto free_pd_table;
+
+ rcu_assign_pointer(pd->em_table, em_table);
if (_is_cpu_device(dev))
for_each_cpu(cpu, cpus) {
@@ -235,26 +445,37 @@ static int em_create_pd(struct device *dev, int nr_states,
dev->em_pd = pd;
return 0;
+
+free_pd_table:
+ kfree(em_table);
+free_pd:
+ kfree(pd);
+ return -EINVAL;
}
-static void em_cpufreq_update_efficiencies(struct device *dev)
+static void
+em_cpufreq_update_efficiencies(struct device *dev, struct em_perf_state *table)
{
struct em_perf_domain *pd = dev->em_pd;
- struct em_perf_state *table;
struct cpufreq_policy *policy;
int found = 0;
- int i;
+ int i, cpu;
- if (!_is_cpu_device(dev) || !pd)
+ if (!_is_cpu_device(dev))
return;
- policy = cpufreq_cpu_get(cpumask_first(em_span_cpus(pd)));
- if (!policy) {
- dev_warn(dev, "EM: Access to CPUFreq policy failed");
+ /* Try to get a CPU which is active and in this PD */
+ cpu = cpumask_first_and(em_span_cpus(pd), cpu_active_mask);
+ if (cpu >= nr_cpu_ids) {
+ dev_warn(dev, "EM: No online CPU for CPUFreq policy\n");
return;
}
- table = pd->table;
+ policy = cpufreq_cpu_get(cpu);
+ if (!policy) {
+ dev_warn(dev, "EM: Access to CPUFreq policy failed\n");
+ return;
+ }
for (i = 0; i < pd->nr_perf_states; i++) {
if (!(table[i].flags & EM_PERF_STATE_INEFFICIENT))
@@ -391,19 +612,34 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
else if (cb->get_cost)
flags |= EM_PERF_DOMAIN_ARTIFICIAL;
+ /*
+ * EM only supports uW (exception is artificial EM).
+ * Therefore, check and force the drivers to provide
+ * power in uW.
+ */
+ if (!microwatts && !(flags & EM_PERF_DOMAIN_ARTIFICIAL)) {
+ dev_err(dev, "EM: only supports uW power values\n");
+ ret = -EINVAL;
+ goto unlock;
+ }
+
ret = em_create_pd(dev, nr_states, cb, cpus, flags);
if (ret)
goto unlock;
dev->em_pd->flags |= flags;
- em_cpufreq_update_efficiencies(dev);
+ em_cpufreq_update_efficiencies(dev, dev->em_pd->em_table->state);
em_debug_create_pd(dev);
dev_info(dev, "EM: created perf domain\n");
unlock:
mutex_unlock(&em_pd_mutex);
+
+ if (_is_cpu_device(dev))
+ em_check_capacity_update();
+
return ret;
}
EXPORT_SYMBOL_GPL(em_dev_register_perf_domain);
@@ -430,9 +666,125 @@ void em_dev_unregister_perf_domain(struct device *dev)
mutex_lock(&em_pd_mutex);
em_debug_remove_pd(dev);
- kfree(dev->em_pd->table);
+ em_table_free(dev->em_pd->em_table);
+
kfree(dev->em_pd);
dev->em_pd = NULL;
mutex_unlock(&em_pd_mutex);
}
EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain);
+
+/*
+ * Adjustment of CPU performance values after boot, when all CPUs capacites
+ * are correctly calculated.
+ */
+static void em_adjust_new_capacity(struct device *dev,
+ struct em_perf_domain *pd,
+ u64 max_cap)
+{
+ struct em_perf_table __rcu *em_table;
+ struct em_perf_state *ps, *new_ps;
+ int ret, ps_size;
+
+ em_table = em_table_alloc(pd);
+ if (!em_table) {
+ dev_warn(dev, "EM: allocation failed\n");
+ return;
+ }
+
+ new_ps = em_table->state;
+
+ rcu_read_lock();
+ ps = em_perf_state_from_pd(pd);
+ /* Initialize data based on old table */
+ ps_size = sizeof(struct em_perf_state) * pd->nr_perf_states;
+ memcpy(new_ps, ps, ps_size);
+
+ rcu_read_unlock();
+
+ em_init_performance(dev, pd, new_ps, pd->nr_perf_states);
+ ret = em_compute_costs(dev, new_ps, NULL, pd->nr_perf_states,
+ pd->flags);
+ if (ret) {
+ dev_warn(dev, "EM: compute costs failed\n");
+ return;
+ }
+
+ ret = em_dev_update_perf_domain(dev, em_table);
+ if (ret)
+ dev_warn(dev, "EM: update failed %d\n", ret);
+
+ /*
+ * This is one-time-update, so give up the ownership in this updater.
+ * The EM framework has incremented the usage counter and from now
+ * will keep the reference (then free the memory when needed).
+ */
+ em_table_free(em_table);
+}
+
+static void em_check_capacity_update(void)
+{
+ cpumask_var_t cpu_done_mask;
+ struct em_perf_state *table;
+ struct em_perf_domain *pd;
+ unsigned long cpu_capacity;
+ int cpu;
+
+ if (!zalloc_cpumask_var(&cpu_done_mask, GFP_KERNEL)) {
+ pr_warn("no free memory\n");
+ return;
+ }
+
+ /* Check if CPUs capacity has changed than update EM */
+ for_each_possible_cpu(cpu) {
+ struct cpufreq_policy *policy;
+ unsigned long em_max_perf;
+ struct device *dev;
+
+ if (cpumask_test_cpu(cpu, cpu_done_mask))
+ continue;
+
+ policy = cpufreq_cpu_get(cpu);
+ if (!policy) {
+ pr_debug("Accessing cpu%d policy failed\n", cpu);
+ schedule_delayed_work(&em_update_work,
+ msecs_to_jiffies(1000));
+ break;
+ }
+ cpufreq_cpu_put(policy);
+
+ pd = em_cpu_get(cpu);
+ if (!pd || em_is_artificial(pd))
+ continue;
+
+ cpumask_or(cpu_done_mask, cpu_done_mask,
+ em_span_cpus(pd));
+
+ cpu_capacity = arch_scale_cpu_capacity(cpu);
+
+ rcu_read_lock();
+ table = em_perf_state_from_pd(pd);
+ em_max_perf = table[pd->nr_perf_states - 1].performance;
+ rcu_read_unlock();
+
+ /*
+ * Check if the CPU capacity has been adjusted during boot
+ * and trigger the update for new performance values.
+ */
+ if (em_max_perf == cpu_capacity)
+ continue;
+
+ pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n",
+ cpu, cpu_capacity, em_max_perf);
+
+ dev = get_cpu_device(cpu);
+ em_adjust_new_capacity(dev, pd, cpu_capacity);
+ }
+
+ free_cpumask_var(cpu_done_mask);
+}
+
+static void em_update_workfn(struct work_struct *work)
+{
+ em_check_capacity_update();
+}
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 4b0b7cf2e019..43b1a82e800c 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -47,6 +47,15 @@ dev_t swsusp_resume_device;
sector_t swsusp_resume_block;
__visible int in_suspend __nosavedata;
+static char hibernate_compressor[CRYPTO_MAX_ALG_NAME] = CONFIG_HIBERNATION_DEF_COMP;
+
+/*
+ * Compression/decompression algorithm to be used while saving/loading
+ * image to/from disk. This would later be used in 'kernel/power/swap.c'
+ * to allocate comp streams.
+ */
+char hib_comp_algo[CRYPTO_MAX_ALG_NAME];
+
enum {
HIBERNATION_INVALID,
HIBERNATION_PLATFORM,
@@ -718,6 +727,9 @@ static int load_image_and_restore(void)
return error;
}
+#define COMPRESSION_ALGO_LZO "lzo"
+#define COMPRESSION_ALGO_LZ4 "lz4"
+
/**
* hibernate - Carry out system hibernation, including saving the image.
*/
@@ -732,6 +744,17 @@ int hibernate(void)
return -EPERM;
}
+ /*
+ * Query for the compression algorithm support if compression is enabled.
+ */
+ if (!nocompress) {
+ strscpy(hib_comp_algo, hibernate_compressor, sizeof(hib_comp_algo));
+ if (crypto_has_comp(hib_comp_algo, 0, 0) != 1) {
+ pr_err("%s compression is not available\n", hib_comp_algo);
+ return -EOPNOTSUPP;
+ }
+ }
+
sleep_flags = lock_system_sleep();
/* The snapshot device should not be opened while we're running */
if (!hibernate_acquire()) {
@@ -766,11 +789,24 @@ int hibernate(void)
if (hibernation_mode == HIBERNATION_PLATFORM)
flags |= SF_PLATFORM_MODE;
- if (nocompress)
+ if (nocompress) {
flags |= SF_NOCOMPRESS_MODE;
- else
+ } else {
flags |= SF_CRC32_MODE;
+ /*
+ * By default, LZO compression is enabled. Use SF_COMPRESSION_ALG_LZ4
+ * to override this behaviour and use LZ4.
+ *
+ * Refer kernel/power/power.h for more details
+ */
+
+ if (!strcmp(hib_comp_algo, COMPRESSION_ALGO_LZ4))
+ flags |= SF_COMPRESSION_ALG_LZ4;
+ else
+ flags |= SF_COMPRESSION_ALG_LZO;
+ }
+
pm_pr_dbg("Writing hibernation image.\n");
error = swsusp_write(flags);
swsusp_free();
@@ -955,6 +991,22 @@ static int software_resume(void)
if (error)
goto Unlock;
+ /*
+ * Check if the hibernation image is compressed. If so, query for
+ * the algorithm support.
+ */
+ if (!(swsusp_header_flags & SF_NOCOMPRESS_MODE)) {
+ if (swsusp_header_flags & SF_COMPRESSION_ALG_LZ4)
+ strscpy(hib_comp_algo, COMPRESSION_ALGO_LZ4, sizeof(hib_comp_algo));
+ else
+ strscpy(hib_comp_algo, COMPRESSION_ALGO_LZO, sizeof(hib_comp_algo));
+ if (crypto_has_comp(hib_comp_algo, 0, 0) != 1) {
+ pr_err("%s compression is not available\n", hib_comp_algo);
+ error = -EOPNOTSUPP;
+ goto Unlock;
+ }
+ }
+
/* The snapshot device should not be opened while we're running */
if (!hibernate_acquire()) {
error = -EBUSY;
@@ -1370,6 +1422,57 @@ static int __init nohibernate_setup(char *str)
return 1;
}
+static const char * const comp_alg_enabled[] = {
+#if IS_ENABLED(CONFIG_CRYPTO_LZO)
+ COMPRESSION_ALGO_LZO,
+#endif
+#if IS_ENABLED(CONFIG_CRYPTO_LZ4)
+ COMPRESSION_ALGO_LZ4,
+#endif
+};
+
+static int hibernate_compressor_param_set(const char *compressor,
+ const struct kernel_param *kp)
+{
+ unsigned int sleep_flags;
+ int index, ret;
+
+ sleep_flags = lock_system_sleep();
+
+ index = sysfs_match_string(comp_alg_enabled, compressor);
+ if (index >= 0) {
+ ret = param_set_copystring(comp_alg_enabled[index], kp);
+ if (!ret)
+ strscpy(hib_comp_algo, comp_alg_enabled[index],
+ sizeof(hib_comp_algo));
+ } else {
+ ret = index;
+ }
+
+ unlock_system_sleep(sleep_flags);
+
+ if (ret)
+ pr_debug("Cannot set specified compressor %s\n",
+ compressor);
+
+ return ret;
+}
+
+static const struct kernel_param_ops hibernate_compressor_param_ops = {
+ .set = hibernate_compressor_param_set,
+ .get = param_get_string,
+};
+
+static struct kparam_string hibernate_compressor_param_string = {
+ .maxlen = sizeof(hibernate_compressor),
+ .string = hibernate_compressor,
+};
+
+module_param_cb(compressor, &hibernate_compressor_param_ops,
+ &hibernate_compressor_param_string, 0644);
+MODULE_PARM_DESC(compressor,
+ "Compression algorithm to be used with hibernation");
+
__setup("noresume", noresume_setup);
__setup("resume_offset=", resume_offset_setup);
__setup("resume=", resume_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index b1ae9b677d03..a9e0693aaf69 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -95,19 +95,6 @@ int unregister_pm_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(unregister_pm_notifier);
-void pm_report_hw_sleep_time(u64 t)
-{
- suspend_stats.last_hw_sleep = t;
- suspend_stats.total_hw_sleep += t;
-}
-EXPORT_SYMBOL_GPL(pm_report_hw_sleep_time);
-
-void pm_report_max_hw_sleep(u64 t)
-{
- suspend_stats.max_hw_sleep = t;
-}
-EXPORT_SYMBOL_GPL(pm_report_max_hw_sleep);
-
int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down)
{
int ret;
@@ -319,26 +306,86 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
power_attr(pm_test);
#endif /* CONFIG_PM_SLEEP_DEBUG */
-static char *suspend_step_name(enum suspend_stat_step step)
-{
- switch (step) {
- case SUSPEND_FREEZE:
- return "freeze";
- case SUSPEND_PREPARE:
- return "prepare";
- case SUSPEND_SUSPEND:
- return "suspend";
- case SUSPEND_SUSPEND_NOIRQ:
- return "suspend_noirq";
- case SUSPEND_RESUME_NOIRQ:
- return "resume_noirq";
- case SUSPEND_RESUME:
- return "resume";
- default:
- return "";
+#define SUSPEND_NR_STEPS SUSPEND_RESUME
+#define REC_FAILED_NUM 2
+
+struct suspend_stats {
+ unsigned int step_failures[SUSPEND_NR_STEPS];
+ unsigned int success;
+ unsigned int fail;
+ int last_failed_dev;
+ char failed_devs[REC_FAILED_NUM][40];
+ int last_failed_errno;
+ int errno[REC_FAILED_NUM];
+ int last_failed_step;
+ u64 last_hw_sleep;
+ u64 total_hw_sleep;
+ u64 max_hw_sleep;
+ enum suspend_stat_step failed_steps[REC_FAILED_NUM];
+};
+
+static struct suspend_stats suspend_stats;
+static DEFINE_MUTEX(suspend_stats_lock);
+
+void dpm_save_failed_dev(const char *name)
+{
+ mutex_lock(&suspend_stats_lock);
+
+ strscpy(suspend_stats.failed_devs[suspend_stats.last_failed_dev],
+ name, sizeof(suspend_stats.failed_devs[0]));
+ suspend_stats.last_failed_dev++;
+ suspend_stats.last_failed_dev %= REC_FAILED_NUM;
+
+ mutex_unlock(&suspend_stats_lock);
+}
+
+void dpm_save_failed_step(enum suspend_stat_step step)
+{
+ suspend_stats.step_failures[step-1]++;
+ suspend_stats.failed_steps[suspend_stats.last_failed_step] = step;
+ suspend_stats.last_failed_step++;
+ suspend_stats.last_failed_step %= REC_FAILED_NUM;
+}
+
+void dpm_save_errno(int err)
+{
+ if (!err) {
+ suspend_stats.success++;
+ return;
}
+
+ suspend_stats.fail++;
+
+ suspend_stats.errno[suspend_stats.last_failed_errno] = err;
+ suspend_stats.last_failed_errno++;
+ suspend_stats.last_failed_errno %= REC_FAILED_NUM;
}
+void pm_report_hw_sleep_time(u64 t)
+{
+ suspend_stats.last_hw_sleep = t;
+ suspend_stats.total_hw_sleep += t;
+}
+EXPORT_SYMBOL_GPL(pm_report_hw_sleep_time);
+
+void pm_report_max_hw_sleep(u64 t)
+{
+ suspend_stats.max_hw_sleep = t;
+}
+EXPORT_SYMBOL_GPL(pm_report_max_hw_sleep);
+
+static const char * const suspend_step_names[] = {
+ [SUSPEND_WORKING] = "",
+ [SUSPEND_FREEZE] = "freeze",
+ [SUSPEND_PREPARE] = "prepare",
+ [SUSPEND_SUSPEND] = "suspend",
+ [SUSPEND_SUSPEND_LATE] = "suspend_late",
+ [SUSPEND_SUSPEND_NOIRQ] = "suspend_noirq",
+ [SUSPEND_RESUME_NOIRQ] = "resume_noirq",
+ [SUSPEND_RESUME_EARLY] = "resume_early",
+ [SUSPEND_RESUME] = "resume",
+};
+
#define suspend_attr(_name, format_str) \
static ssize_t _name##_show(struct kobject *kobj, \
struct kobj_attribute *attr, char *buf) \
@@ -347,20 +394,30 @@ static ssize_t _name##_show(struct kobject *kobj, \
} \
static struct kobj_attribute _name = __ATTR_RO(_name)
-suspend_attr(success, "%d\n");
-suspend_attr(fail, "%d\n");
-suspend_attr(failed_freeze, "%d\n");
-suspend_attr(failed_prepare, "%d\n");
-suspend_attr(failed_suspend, "%d\n");
-suspend_attr(failed_suspend_late, "%d\n");
-suspend_attr(failed_suspend_noirq, "%d\n");
-suspend_attr(failed_resume, "%d\n");
-suspend_attr(failed_resume_early, "%d\n");
-suspend_attr(failed_resume_noirq, "%d\n");
+suspend_attr(success, "%u\n");
+suspend_attr(fail, "%u\n");
suspend_attr(last_hw_sleep, "%llu\n");
suspend_attr(total_hw_sleep, "%llu\n");
suspend_attr(max_hw_sleep, "%llu\n");
+#define suspend_step_attr(_name, step) \
+static ssize_t _name##_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, char *buf) \
+{ \
+ return sprintf(buf, "%u\n", \
+ suspend_stats.step_failures[step-1]); \
+} \
+static struct kobj_attribute _name = __ATTR_RO(_name)
+
+suspend_step_attr(failed_freeze, SUSPEND_FREEZE);
+suspend_step_attr(failed_prepare, SUSPEND_PREPARE);
+suspend_step_attr(failed_suspend, SUSPEND_SUSPEND);
+suspend_step_attr(failed_suspend_late, SUSPEND_SUSPEND_LATE);
+suspend_step_attr(failed_suspend_noirq, SUSPEND_SUSPEND_NOIRQ);
+suspend_step_attr(failed_resume, SUSPEND_RESUME);
+suspend_step_attr(failed_resume_early, SUSPEND_RESUME_EARLY);
+suspend_step_attr(failed_resume_noirq, SUSPEND_RESUME_NOIRQ);
+
static ssize_t last_failed_dev_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -392,16 +449,14 @@ static struct kobj_attribute last_failed_errno = __ATTR_RO(last_failed_errno);
static ssize_t last_failed_step_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- int index;
enum suspend_stat_step step;
- char *last_failed_step = NULL;
+ int index;
index = suspend_stats.last_failed_step + REC_FAILED_NUM - 1;
index %= REC_FAILED_NUM;
step = suspend_stats.failed_steps[index];
- last_failed_step = suspend_step_name(step);
- return sprintf(buf, "%s\n", last_failed_step);
+ return sprintf(buf, "%s\n", suspend_step_names[step]);
}
static struct kobj_attribute last_failed_step = __ATTR_RO(last_failed_step);
@@ -449,6 +504,7 @@ static const struct attribute_group suspend_attr_group = {
static int suspend_stats_show(struct seq_file *s, void *unused)
{
int i, index, last_dev, last_errno, last_step;
+ enum suspend_stat_step step;
last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
last_dev %= REC_FAILED_NUM;
@@ -456,47 +512,35 @@ static int suspend_stats_show(struct seq_file *s, void *unused)
last_errno %= REC_FAILED_NUM;
last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1;
last_step %= REC_FAILED_NUM;
- seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n"
- "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n",
- "success", suspend_stats.success,
- "fail", suspend_stats.fail,
- "failed_freeze", suspend_stats.failed_freeze,
- "failed_prepare", suspend_stats.failed_prepare,
- "failed_suspend", suspend_stats.failed_suspend,
- "failed_suspend_late",
- suspend_stats.failed_suspend_late,
- "failed_suspend_noirq",
- suspend_stats.failed_suspend_noirq,
- "failed_resume", suspend_stats.failed_resume,
- "failed_resume_early",
- suspend_stats.failed_resume_early,
- "failed_resume_noirq",
- suspend_stats.failed_resume_noirq);
+
+ seq_printf(s, "success: %u\nfail: %u\n",
+ suspend_stats.success, suspend_stats.fail);
+
+ for (step = SUSPEND_FREEZE; step <= SUSPEND_NR_STEPS; step++)
+ seq_printf(s, "failed_%s: %u\n", suspend_step_names[step],
+ suspend_stats.step_failures[step-1]);
+
seq_printf(s, "failures:\n last_failed_dev:\t%-s\n",
- suspend_stats.failed_devs[last_dev]);
+ suspend_stats.failed_devs[last_dev]);
for (i = 1; i < REC_FAILED_NUM; i++) {
index = last_dev + REC_FAILED_NUM - i;
index %= REC_FAILED_NUM;
- seq_printf(s, "\t\t\t%-s\n",
- suspend_stats.failed_devs[index]);
+ seq_printf(s, "\t\t\t%-s\n", suspend_stats.failed_devs[index]);
}
seq_printf(s, " last_failed_errno:\t%-d\n",
suspend_stats.errno[last_errno]);
for (i = 1; i < REC_FAILED_NUM; i++) {
index = last_errno + REC_FAILED_NUM - i;
index %= REC_FAILED_NUM;
- seq_printf(s, "\t\t\t%-d\n",
- suspend_stats.errno[index]);
+ seq_printf(s, "\t\t\t%-d\n", suspend_stats.errno[index]);
}
seq_printf(s, " last_failed_step:\t%-s\n",
- suspend_step_name(
- suspend_stats.failed_steps[last_step]));
+ suspend_step_names[suspend_stats.failed_steps[last_step]]);
for (i = 1; i < REC_FAILED_NUM; i++) {
index = last_step + REC_FAILED_NUM - i;
index %= REC_FAILED_NUM;
seq_printf(s, "\t\t\t%-s\n",
- suspend_step_name(
- suspend_stats.failed_steps[index]));
+ suspend_step_names[suspend_stats.failed_steps[index]]);
}
return 0;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 8499a39c62f4..de0e6b1077f2 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -6,6 +6,7 @@
#include <linux/compiler.h>
#include <linux/cpu.h>
#include <linux/cpuidle.h>
+#include <linux/crypto.h>
struct swsusp_info {
struct new_utsname uts;
@@ -54,6 +55,10 @@ asmlinkage int swsusp_save(void);
/* kernel/power/hibernate.c */
extern bool freezer_test_done;
+extern char hib_comp_algo[CRYPTO_MAX_ALG_NAME];
+
+/* kernel/power/swap.c */
+extern unsigned int swsusp_header_flags;
extern int hibernation_snapshot(int platform_mode);
extern int hibernation_restore(int platform_mode);
@@ -148,7 +153,7 @@ extern unsigned int snapshot_additional_pages(struct zone *zone);
extern unsigned long snapshot_get_image_size(void);
extern int snapshot_read_next(struct snapshot_handle *handle);
extern int snapshot_write_next(struct snapshot_handle *handle);
-extern void snapshot_write_finalize(struct snapshot_handle *handle);
+int snapshot_write_finalize(struct snapshot_handle *handle);
extern int snapshot_image_loaded(struct snapshot_handle *handle);
extern bool hibernate_acquire(void);
@@ -162,11 +167,25 @@ extern int swsusp_swap_in_use(void);
* Flags that can be passed from the hibernatig hernel to the "boot" kernel in
* the image header.
*/
+#define SF_COMPRESSION_ALG_LZO 0 /* dummy, details given below */
#define SF_PLATFORM_MODE 1
#define SF_NOCOMPRESS_MODE 2
#define SF_CRC32_MODE 4
#define SF_HW_SIG 8
+/*
+ * Bit to indicate the compression algorithm to be used(for LZ4). The same
+ * could be checked while saving/loading image to/from disk to use the
+ * corresponding algorithms.
+ *
+ * By default, LZO compression is enabled if SF_CRC32_MODE is set. Use
+ * SF_COMPRESSION_ALG_LZ4 to override this behaviour and use LZ4.
+ *
+ * SF_CRC32_MODE, SF_COMPRESSION_ALG_LZO(dummy) -> Compression, LZO
+ * SF_CRC32_MODE, SF_COMPRESSION_ALG_LZ4 -> Compression, LZ4
+ */
+#define SF_COMPRESSION_ALG_LZ4 16
+
/* kernel/power/hibernate.c */
int swsusp_check(bool exclusive);
extern void swsusp_free(void);
@@ -327,3 +346,5 @@ static inline void pm_sleep_enable_secondary_cpus(void)
suspend_enable_secondary_cpus();
cpuidle_resume();
}
+
+void dpm_save_errno(int err);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 5c96ff067c64..405eddbda4fc 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -58,22 +58,24 @@ static inline void hibernate_restore_protection_end(void)
hibernate_restore_protection_active = false;
}
-static inline void hibernate_restore_protect_page(void *page_address)
+static inline int __must_check hibernate_restore_protect_page(void *page_address)
{
if (hibernate_restore_protection_active)
- set_memory_ro((unsigned long)page_address, 1);
+ return set_memory_ro((unsigned long)page_address, 1);
+ return 0;
}
-static inline void hibernate_restore_unprotect_page(void *page_address)
+static inline int hibernate_restore_unprotect_page(void *page_address)
{
if (hibernate_restore_protection_active)
- set_memory_rw((unsigned long)page_address, 1);
+ return set_memory_rw((unsigned long)page_address, 1);
+ return 0;
}
#else
static inline void hibernate_restore_protection_begin(void) {}
static inline void hibernate_restore_protection_end(void) {}
-static inline void hibernate_restore_protect_page(void *page_address) {}
-static inline void hibernate_restore_unprotect_page(void *page_address) {}
+static inline int __must_check hibernate_restore_protect_page(void *page_address) {return 0; }
+static inline int hibernate_restore_unprotect_page(void *page_address) {return 0; }
#endif /* CONFIG_STRICT_KERNEL_RWX && CONFIG_ARCH_HAS_SET_MEMORY */
@@ -2832,7 +2834,9 @@ next:
}
} else {
copy_last_highmem_page();
- hibernate_restore_protect_page(handle->buffer);
+ error = hibernate_restore_protect_page(handle->buffer);
+ if (error)
+ return error;
handle->buffer = get_buffer(&orig_bm, &ca);
if (IS_ERR(handle->buffer))
return PTR_ERR(handle->buffer);
@@ -2858,15 +2862,18 @@ next:
* stored in highmem. Additionally, it recycles bitmap memory that's not
* necessary any more.
*/
-void snapshot_write_finalize(struct snapshot_handle *handle)
+int snapshot_write_finalize(struct snapshot_handle *handle)
{
+ int error;
+
copy_last_highmem_page();
- hibernate_restore_protect_page(handle->buffer);
+ error = hibernate_restore_protect_page(handle->buffer);
/* Do that only if we have loaded the image entirely */
if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages + nr_zero_pages) {
memory_bm_recycle(&orig_bm);
free_highmem_data();
}
+ return error;
}
int snapshot_image_loaded(struct snapshot_handle *handle)
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index fa3bf161d13f..e3ae93bbcb9b 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -192,6 +192,7 @@ static int __init mem_sleep_default_setup(char *str)
if (mem_sleep_labels[state] &&
!strcmp(str, mem_sleep_labels[state])) {
mem_sleep_default = state;
+ mem_sleep_current = state;
break;
}
@@ -367,7 +368,6 @@ static int suspend_prepare(suspend_state_t state)
if (!error)
return 0;
- suspend_stats.failed_freeze++;
dpm_save_failed_step(SUSPEND_FREEZE);
pm_notifier_call_chain(PM_POST_SUSPEND);
Restore:
@@ -617,12 +617,7 @@ int pm_suspend(suspend_state_t state)
pr_info("suspend entry (%s)\n", mem_sleep_labels[state]);
error = enter_state(state);
- if (error) {
- suspend_stats.fail++;
- dpm_save_failed_errno(error);
- } else {
- suspend_stats.success++;
- }
+ dpm_save_errno(error);
pr_info("suspend exit\n");
return error;
}
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index b663a97f5867..d4856ec61570 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -201,7 +201,7 @@ static int __init test_suspend(void)
}
/* RTCs have initialized by now too ... can we use one? */
- dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm);
+ dev = class_find_device(&rtc_class, NULL, NULL, has_wakealarm);
if (dev) {
rtc = rtc_class_open(dev_name(dev));
put_device(dev);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 692f12fe60c1..5bc04bfe2db1 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -23,7 +23,6 @@
#include <linux/swapops.h>
#include <linux/pm.h>
#include <linux/slab.h>
-#include <linux/lzo.h>
#include <linux/vmalloc.h>
#include <linux/cpumask.h>
#include <linux/atomic.h>
@@ -339,6 +338,13 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
return error;
}
+/*
+ * Hold the swsusp_header flag. This is used in software_resume() in
+ * 'kernel/power/hibernate' to check if the image is compressed and query
+ * for the compression algorithm support(if so).
+ */
+unsigned int swsusp_header_flags;
+
/**
* swsusp_swap_check - check if the resume device is a swap device
* and get its index (if so)
@@ -514,25 +520,30 @@ static int swap_writer_finish(struct swap_map_handle *handle,
return error;
}
+/*
+ * Bytes we need for compressed data in worst case. We assume(limitation)
+ * this is the worst of all the compression algorithms.
+ */
+#define bytes_worst_compress(x) ((x) + ((x) / 16) + 64 + 3 + 2)
+
/* We need to remember how much compressed data we need to read. */
-#define LZO_HEADER sizeof(size_t)
+#define CMP_HEADER sizeof(size_t)
/* Number of pages/bytes we'll compress at one time. */
-#define LZO_UNC_PAGES 32
-#define LZO_UNC_SIZE (LZO_UNC_PAGES * PAGE_SIZE)
+#define UNC_PAGES 32
+#define UNC_SIZE (UNC_PAGES * PAGE_SIZE)
-/* Number of pages/bytes we need for compressed data (worst case). */
-#define LZO_CMP_PAGES DIV_ROUND_UP(lzo1x_worst_compress(LZO_UNC_SIZE) + \
- LZO_HEADER, PAGE_SIZE)
-#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE)
+/* Number of pages we need for compressed data (worst case). */
+#define CMP_PAGES DIV_ROUND_UP(bytes_worst_compress(UNC_SIZE) + \
+ CMP_HEADER, PAGE_SIZE)
+#define CMP_SIZE (CMP_PAGES * PAGE_SIZE)
/* Maximum number of threads for compression/decompression. */
-#define LZO_THREADS 3
+#define CMP_THREADS 3
/* Minimum/maximum number of pages for read buffering. */
-#define LZO_MIN_RD_PAGES 1024
-#define LZO_MAX_RD_PAGES 8192
-
+#define CMP_MIN_RD_PAGES 1024
+#define CMP_MAX_RD_PAGES 8192
/**
* save_image - save the suspend image data
@@ -593,8 +604,8 @@ struct crc_data {
wait_queue_head_t go; /* start crc update */
wait_queue_head_t done; /* crc update done */
u32 *crc32; /* points to handle's crc32 */
- size_t *unc_len[LZO_THREADS]; /* uncompressed lengths */
- unsigned char *unc[LZO_THREADS]; /* uncompressed data */
+ size_t *unc_len[CMP_THREADS]; /* uncompressed lengths */
+ unsigned char *unc[CMP_THREADS]; /* uncompressed data */
};
/*
@@ -625,10 +636,11 @@ static int crc32_threadfn(void *data)
return 0;
}
/*
- * Structure used for LZO data compression.
+ * Structure used for data compression.
*/
struct cmp_data {
struct task_struct *thr; /* thread */
+ struct crypto_comp *cc; /* crypto compressor stream */
atomic_t ready; /* ready to start flag */
atomic_t stop; /* ready to stop flag */
int ret; /* return code */
@@ -636,17 +648,20 @@ struct cmp_data {
wait_queue_head_t done; /* compression done */
size_t unc_len; /* uncompressed length */
size_t cmp_len; /* compressed length */
- unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */
- unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
- unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */
+ unsigned char unc[UNC_SIZE]; /* uncompressed buffer */
+ unsigned char cmp[CMP_SIZE]; /* compressed buffer */
};
+/* Indicates the image size after compression */
+static atomic_t compressed_size = ATOMIC_INIT(0);
+
/*
* Compression function that runs in its own thread.
*/
-static int lzo_compress_threadfn(void *data)
+static int compress_threadfn(void *data)
{
struct cmp_data *d = data;
+ unsigned int cmp_len = 0;
while (1) {
wait_event(d->go, atomic_read_acquire(&d->ready) ||
@@ -660,9 +675,13 @@ static int lzo_compress_threadfn(void *data)
}
atomic_set(&d->ready, 0);
- d->ret = lzo1x_1_compress(d->unc, d->unc_len,
- d->cmp + LZO_HEADER, &d->cmp_len,
- d->wrk);
+ cmp_len = CMP_SIZE - CMP_HEADER;
+ d->ret = crypto_comp_compress(d->cc, d->unc, d->unc_len,
+ d->cmp + CMP_HEADER,
+ &cmp_len);
+ d->cmp_len = cmp_len;
+
+ atomic_set(&compressed_size, atomic_read(&compressed_size) + d->cmp_len);
atomic_set_release(&d->stop, 1);
wake_up(&d->done);
}
@@ -670,14 +689,14 @@ static int lzo_compress_threadfn(void *data)
}
/**
- * save_image_lzo - Save the suspend image data compressed with LZO.
+ * save_compressed_image - Save the suspend image data after compression.
* @handle: Swap map handle to use for saving the image.
* @snapshot: Image to read data from.
* @nr_to_write: Number of pages to save.
*/
-static int save_image_lzo(struct swap_map_handle *handle,
- struct snapshot_handle *snapshot,
- unsigned int nr_to_write)
+static int save_compressed_image(struct swap_map_handle *handle,
+ struct snapshot_handle *snapshot,
+ unsigned int nr_to_write)
{
unsigned int m;
int ret = 0;
@@ -694,23 +713,25 @@ static int save_image_lzo(struct swap_map_handle *handle,
hib_init_batch(&hb);
+ atomic_set(&compressed_size, 0);
+
/*
* We'll limit the number of threads for compression to limit memory
* footprint.
*/
nr_threads = num_online_cpus() - 1;
- nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
+ nr_threads = clamp_val(nr_threads, 1, CMP_THREADS);
page = (void *)__get_free_page(GFP_NOIO | __GFP_HIGH);
if (!page) {
- pr_err("Failed to allocate LZO page\n");
+ pr_err("Failed to allocate %s page\n", hib_comp_algo);
ret = -ENOMEM;
goto out_clean;
}
data = vzalloc(array_size(nr_threads, sizeof(*data)));
if (!data) {
- pr_err("Failed to allocate LZO data\n");
+ pr_err("Failed to allocate %s data\n", hib_comp_algo);
ret = -ENOMEM;
goto out_clean;
}
@@ -729,7 +750,14 @@ static int save_image_lzo(struct swap_map_handle *handle,
init_waitqueue_head(&data[thr].go);
init_waitqueue_head(&data[thr].done);
- data[thr].thr = kthread_run(lzo_compress_threadfn,
+ data[thr].cc = crypto_alloc_comp(hib_comp_algo, 0, 0);
+ if (IS_ERR_OR_NULL(data[thr].cc)) {
+ pr_err("Could not allocate comp stream %ld\n", PTR_ERR(data[thr].cc));
+ ret = -EFAULT;
+ goto out_clean;
+ }
+
+ data[thr].thr = kthread_run(compress_threadfn,
&data[thr],
"image_compress/%u", thr);
if (IS_ERR(data[thr].thr)) {
@@ -767,7 +795,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
*/
handle->reqd_free_pages = reqd_free_pages();
- pr_info("Using %u thread(s) for compression\n", nr_threads);
+ pr_info("Using %u thread(s) for %s compression\n", nr_threads, hib_comp_algo);
pr_info("Compressing and saving image data (%u pages)...\n",
nr_to_write);
m = nr_to_write / 10;
@@ -777,7 +805,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
start = ktime_get();
for (;;) {
for (thr = 0; thr < nr_threads; thr++) {
- for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
+ for (off = 0; off < UNC_SIZE; off += PAGE_SIZE) {
ret = snapshot_read_next(snapshot);
if (ret < 0)
goto out_finish;
@@ -817,14 +845,14 @@ static int save_image_lzo(struct swap_map_handle *handle,
ret = data[thr].ret;
if (ret < 0) {
- pr_err("LZO compression failed\n");
+ pr_err("%s compression failed\n", hib_comp_algo);
goto out_finish;
}
if (unlikely(!data[thr].cmp_len ||
data[thr].cmp_len >
- lzo1x_worst_compress(data[thr].unc_len))) {
- pr_err("Invalid LZO compressed length\n");
+ bytes_worst_compress(data[thr].unc_len))) {
+ pr_err("Invalid %s compressed length\n", hib_comp_algo);
ret = -1;
goto out_finish;
}
@@ -840,7 +868,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
* read it.
*/
for (off = 0;
- off < LZO_HEADER + data[thr].cmp_len;
+ off < CMP_HEADER + data[thr].cmp_len;
off += PAGE_SIZE) {
memcpy(page, data[thr].cmp + off, PAGE_SIZE);
@@ -862,6 +890,9 @@ out_finish:
if (!ret)
pr_info("Image saving done\n");
swsusp_show_speed(start, stop, nr_to_write, "Wrote");
+ pr_info("Image size after compression: %d kbytes\n",
+ (atomic_read(&compressed_size) / 1024));
+
out_clean:
hib_finish_batch(&hb);
if (crc) {
@@ -870,9 +901,12 @@ out_clean:
kfree(crc);
}
if (data) {
- for (thr = 0; thr < nr_threads; thr++)
+ for (thr = 0; thr < nr_threads; thr++) {
if (data[thr].thr)
kthread_stop(data[thr].thr);
+ if (data[thr].cc)
+ crypto_free_comp(data[thr].cc);
+ }
vfree(data);
}
if (page) free_page((unsigned long)page);
@@ -942,7 +976,7 @@ int swsusp_write(unsigned int flags)
if (!error) {
error = (flags & SF_NOCOMPRESS_MODE) ?
save_image(&handle, &snapshot, pages - 1) :
- save_image_lzo(&handle, &snapshot, pages - 1);
+ save_compressed_image(&handle, &snapshot, pages - 1);
}
out_finish:
error = swap_writer_finish(&handle, flags, error);
@@ -1100,8 +1134,8 @@ static int load_image(struct swap_map_handle *handle,
ret = err2;
if (!ret) {
pr_info("Image loading done\n");
- snapshot_write_finalize(snapshot);
- if (!snapshot_image_loaded(snapshot))
+ ret = snapshot_write_finalize(snapshot);
+ if (!ret && !snapshot_image_loaded(snapshot))
ret = -ENODATA;
}
swsusp_show_speed(start, stop, nr_to_read, "Read");
@@ -1109,10 +1143,11 @@ static int load_image(struct swap_map_handle *handle,
}
/*
- * Structure used for LZO data decompression.
+ * Structure used for data decompression.
*/
struct dec_data {
struct task_struct *thr; /* thread */
+ struct crypto_comp *cc; /* crypto compressor stream */
atomic_t ready; /* ready to start flag */
atomic_t stop; /* ready to stop flag */
int ret; /* return code */
@@ -1120,16 +1155,17 @@ struct dec_data {
wait_queue_head_t done; /* decompression done */
size_t unc_len; /* uncompressed length */
size_t cmp_len; /* compressed length */
- unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */
- unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
+ unsigned char unc[UNC_SIZE]; /* uncompressed buffer */
+ unsigned char cmp[CMP_SIZE]; /* compressed buffer */
};
/*
* Decompression function that runs in its own thread.
*/
-static int lzo_decompress_threadfn(void *data)
+static int decompress_threadfn(void *data)
{
struct dec_data *d = data;
+ unsigned int unc_len = 0;
while (1) {
wait_event(d->go, atomic_read_acquire(&d->ready) ||
@@ -1143,9 +1179,11 @@ static int lzo_decompress_threadfn(void *data)
}
atomic_set(&d->ready, 0);
- d->unc_len = LZO_UNC_SIZE;
- d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len,
- d->unc, &d->unc_len);
+ unc_len = UNC_SIZE;
+ d->ret = crypto_comp_decompress(d->cc, d->cmp + CMP_HEADER, d->cmp_len,
+ d->unc, &unc_len);
+ d->unc_len = unc_len;
+
if (clean_pages_on_decompress)
flush_icache_range((unsigned long)d->unc,
(unsigned long)d->unc + d->unc_len);
@@ -1157,14 +1195,14 @@ static int lzo_decompress_threadfn(void *data)
}
/**
- * load_image_lzo - Load compressed image data and decompress them with LZO.
+ * load_compressed_image - Load compressed image data and decompress it.
* @handle: Swap map handle to use for loading data.
* @snapshot: Image to copy uncompressed data into.
* @nr_to_read: Number of pages to load.
*/
-static int load_image_lzo(struct swap_map_handle *handle,
- struct snapshot_handle *snapshot,
- unsigned int nr_to_read)
+static int load_compressed_image(struct swap_map_handle *handle,
+ struct snapshot_handle *snapshot,
+ unsigned int nr_to_read)
{
unsigned int m;
int ret = 0;
@@ -1189,18 +1227,18 @@ static int load_image_lzo(struct swap_map_handle *handle,
* footprint.
*/
nr_threads = num_online_cpus() - 1;
- nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
+ nr_threads = clamp_val(nr_threads, 1, CMP_THREADS);
- page = vmalloc(array_size(LZO_MAX_RD_PAGES, sizeof(*page)));
+ page = vmalloc(array_size(CMP_MAX_RD_PAGES, sizeof(*page)));
if (!page) {
- pr_err("Failed to allocate LZO page\n");
+ pr_err("Failed to allocate %s page\n", hib_comp_algo);
ret = -ENOMEM;
goto out_clean;
}
data = vzalloc(array_size(nr_threads, sizeof(*data)));
if (!data) {
- pr_err("Failed to allocate LZO data\n");
+ pr_err("Failed to allocate %s data\n", hib_comp_algo);
ret = -ENOMEM;
goto out_clean;
}
@@ -1221,7 +1259,14 @@ static int load_image_lzo(struct swap_map_handle *handle,
init_waitqueue_head(&data[thr].go);
init_waitqueue_head(&data[thr].done);
- data[thr].thr = kthread_run(lzo_decompress_threadfn,
+ data[thr].cc = crypto_alloc_comp(hib_comp_algo, 0, 0);
+ if (IS_ERR_OR_NULL(data[thr].cc)) {
+ pr_err("Could not allocate comp stream %ld\n", PTR_ERR(data[thr].cc));
+ ret = -EFAULT;
+ goto out_clean;
+ }
+
+ data[thr].thr = kthread_run(decompress_threadfn,
&data[thr],
"image_decompress/%u", thr);
if (IS_ERR(data[thr].thr)) {
@@ -1262,18 +1307,18 @@ static int load_image_lzo(struct swap_map_handle *handle,
*/
if (low_free_pages() > snapshot_get_image_size())
read_pages = (low_free_pages() - snapshot_get_image_size()) / 2;
- read_pages = clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES);
+ read_pages = clamp_val(read_pages, CMP_MIN_RD_PAGES, CMP_MAX_RD_PAGES);
for (i = 0; i < read_pages; i++) {
- page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
+ page[i] = (void *)__get_free_page(i < CMP_PAGES ?
GFP_NOIO | __GFP_HIGH :
GFP_NOIO | __GFP_NOWARN |
__GFP_NORETRY);
if (!page[i]) {
- if (i < LZO_CMP_PAGES) {
+ if (i < CMP_PAGES) {
ring_size = i;
- pr_err("Failed to allocate LZO pages\n");
+ pr_err("Failed to allocate %s pages\n", hib_comp_algo);
ret = -ENOMEM;
goto out_clean;
} else {
@@ -1283,7 +1328,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
}
want = ring_size = i;
- pr_info("Using %u thread(s) for decompression\n", nr_threads);
+ pr_info("Using %u thread(s) for %s decompression\n", nr_threads, hib_comp_algo);
pr_info("Loading and decompressing image data (%u pages)...\n",
nr_to_read);
m = nr_to_read / 10;
@@ -1344,13 +1389,13 @@ static int load_image_lzo(struct swap_map_handle *handle,
data[thr].cmp_len = *(size_t *)page[pg];
if (unlikely(!data[thr].cmp_len ||
data[thr].cmp_len >
- lzo1x_worst_compress(LZO_UNC_SIZE))) {
- pr_err("Invalid LZO compressed length\n");
+ bytes_worst_compress(UNC_SIZE))) {
+ pr_err("Invalid %s compressed length\n", hib_comp_algo);
ret = -1;
goto out_finish;
}
- need = DIV_ROUND_UP(data[thr].cmp_len + LZO_HEADER,
+ need = DIV_ROUND_UP(data[thr].cmp_len + CMP_HEADER,
PAGE_SIZE);
if (need > have) {
if (eof > 1) {
@@ -1361,7 +1406,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
}
for (off = 0;
- off < LZO_HEADER + data[thr].cmp_len;
+ off < CMP_HEADER + data[thr].cmp_len;
off += PAGE_SIZE) {
memcpy(data[thr].cmp + off,
page[pg], PAGE_SIZE);
@@ -1378,7 +1423,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
/*
* Wait for more data while we are decompressing.
*/
- if (have < LZO_CMP_PAGES && asked) {
+ if (have < CMP_PAGES && asked) {
ret = hib_wait_io(&hb);
if (ret)
goto out_finish;
@@ -1396,14 +1441,14 @@ static int load_image_lzo(struct swap_map_handle *handle,
ret = data[thr].ret;
if (ret < 0) {
- pr_err("LZO decompression failed\n");
+ pr_err("%s decompression failed\n", hib_comp_algo);
goto out_finish;
}
if (unlikely(!data[thr].unc_len ||
- data[thr].unc_len > LZO_UNC_SIZE ||
- data[thr].unc_len & (PAGE_SIZE - 1))) {
- pr_err("Invalid LZO uncompressed length\n");
+ data[thr].unc_len > UNC_SIZE ||
+ data[thr].unc_len & (PAGE_SIZE - 1))) {
+ pr_err("Invalid %s uncompressed length\n", hib_comp_algo);
ret = -1;
goto out_finish;
}
@@ -1441,8 +1486,8 @@ out_finish:
stop = ktime_get();
if (!ret) {
pr_info("Image loading done\n");
- snapshot_write_finalize(snapshot);
- if (!snapshot_image_loaded(snapshot))
+ ret = snapshot_write_finalize(snapshot);
+ if (!ret && !snapshot_image_loaded(snapshot))
ret = -ENODATA;
if (!ret) {
if (swsusp_header->flags & SF_CRC32_MODE) {
@@ -1464,9 +1509,12 @@ out_clean:
kfree(crc);
}
if (data) {
- for (thr = 0; thr < nr_threads; thr++)
+ for (thr = 0; thr < nr_threads; thr++) {
if (data[thr].thr)
kthread_stop(data[thr].thr);
+ if (data[thr].cc)
+ crypto_free_comp(data[thr].cc);
+ }
vfree(data);
}
vfree(page);
@@ -1500,7 +1548,7 @@ int swsusp_read(unsigned int *flags_p)
if (!error) {
error = (*flags_p & SF_NOCOMPRESS_MODE) ?
load_image(&handle, &snapshot, header->pages - 1) :
- load_image_lzo(&handle, &snapshot, header->pages - 1);
+ load_compressed_image(&handle, &snapshot, header->pages - 1);
}
swap_reader_finish(&handle);
end:
@@ -1535,6 +1583,7 @@ int swsusp_check(bool exclusive)
if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
+ swsusp_header_flags = swsusp_header->flags;
/* Reset swap signature now */
error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC,
swsusp_resume_block,
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 3a4e70366f35..3aa41ba22129 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -317,7 +317,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
break;
case SNAPSHOT_ATOMIC_RESTORE:
- snapshot_write_finalize(&data->handle);
+ error = snapshot_write_finalize(&data->handle);
+ if (error)
+ break;
if (data->mode != O_WRONLY || !data->frozen ||
!snapshot_image_loaded(&data->handle)) {
error = -EPERM;
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index b96077152f49..c8093bcc01fe 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -140,39 +140,6 @@ static inline bool nbcon_state_try_cmpxchg(struct console *con, struct nbcon_sta
return atomic_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_state), &cur->atom, new->atom);
}
-#ifdef CONFIG_64BIT
-
-#define __seq_to_nbcon_seq(seq) (seq)
-#define __nbcon_seq_to_seq(seq) (seq)
-
-#else /* CONFIG_64BIT */
-
-#define __seq_to_nbcon_seq(seq) ((u32)seq)
-
-static inline u64 __nbcon_seq_to_seq(u32 nbcon_seq)
-{
- u64 seq;
- u64 rb_next_seq;
-
- /*
- * The provided sequence is only the lower 32 bits of the ringbuffer
- * sequence. It needs to be expanded to 64bit. Get the next sequence
- * number from the ringbuffer and fold it.
- *
- * Having a 32bit representation in the console is sufficient.
- * If a console ever gets more than 2^31 records behind
- * the ringbuffer then this is the least of the problems.
- *
- * Also the access to the ring buffer is always safe.
- */
- rb_next_seq = prb_next_seq(prb);
- seq = rb_next_seq - ((u32)rb_next_seq - nbcon_seq);
-
- return seq;
-}
-
-#endif /* CONFIG_64BIT */
-
/**
* nbcon_seq_read - Read the current console sequence
* @con: Console to read the sequence of
@@ -183,7 +150,7 @@ u64 nbcon_seq_read(struct console *con)
{
unsigned long nbcon_seq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_seq));
- return __nbcon_seq_to_seq(nbcon_seq);
+ return __ulseq_to_u64seq(prb, nbcon_seq);
}
/**
@@ -204,7 +171,7 @@ void nbcon_seq_force(struct console *con, u64 seq)
*/
u64 valid_seq = max_t(u64, seq, prb_first_valid_seq(prb));
- atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __seq_to_nbcon_seq(valid_seq));
+ atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __u64seq_to_ulseq(valid_seq));
/* Clear con->seq since nbcon consoles use con->nbcon_seq instead. */
con->seq = 0;
@@ -223,11 +190,11 @@ void nbcon_seq_force(struct console *con, u64 seq)
*/
static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq)
{
- unsigned long nbcon_seq = __seq_to_nbcon_seq(ctxt->seq);
+ unsigned long nbcon_seq = __u64seq_to_ulseq(ctxt->seq);
struct console *con = ctxt->console;
if (atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_seq), &nbcon_seq,
- __seq_to_nbcon_seq(new_seq))) {
+ __u64seq_to_ulseq(new_seq))) {
ctxt->seq = new_seq;
} else {
ctxt->seq = nbcon_seq_read(con);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index f2444b581e16..ca5146006b94 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -34,7 +34,7 @@
#include <linux/security.h>
#include <linux/memblock.h>
#include <linux/syscalls.h>
-#include <linux/crash_core.h>
+#include <linux/vmcore_info.h>
#include <linux/ratelimit.h>
#include <linux/kmsg_dump.h>
#include <linux/syslog.h>
@@ -347,6 +347,29 @@ static bool panic_in_progress(void)
return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID);
}
+/* Return true if a panic is in progress on the current CPU. */
+bool this_cpu_in_panic(void)
+{
+ /*
+ * We can use raw_smp_processor_id() here because it is impossible for
+ * the task to be migrated to the panic_cpu, or away from it. If
+ * panic_cpu has already been set, and we're not currently executing on
+ * that CPU, then we never will be.
+ */
+ return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id());
+}
+
+/*
+ * Return true if a panic is in progress on a remote CPU.
+ *
+ * On true, the local CPU should immediately release any printing resources
+ * that may be needed by the panic CPU.
+ */
+bool other_cpu_in_panic(void)
+{
+ return (panic_in_progress() && !this_cpu_in_panic());
+}
+
/*
* This is used for debugging the mess that is the VT code by
* keeping track if we have the console semaphore held. It's
@@ -439,12 +462,6 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
static DEFINE_MUTEX(syslog_lock);
#ifdef CONFIG_PRINTK
-/*
- * During panic, heavy printk by other CPUs can delay the
- * panic and risk deadlock on console resources.
- */
-static int __read_mostly suppress_panic_printk;
-
DECLARE_WAIT_QUEUE_HEAD(log_wait);
/* All 3 protected by @syslog_lock. */
/* the next printk record to read by syslog(READ) or /proc/kmsg */
@@ -598,17 +615,6 @@ static int check_syslog_permissions(int type, int source)
if (syslog_action_restricted(type)) {
if (capable(CAP_SYSLOG))
goto ok;
- /*
- * For historical reasons, accept CAP_SYS_ADMIN too, with
- * a warning.
- */
- if (capable(CAP_SYS_ADMIN)) {
- pr_warn_once("%s (%d): Attempt to access syslog with "
- "CAP_SYS_ADMIN but no CAP_SYSLOG "
- "(deprecated).\n",
- current->comm, task_pid_nr(current));
- goto ok;
- }
return -EPERM;
}
ok:
@@ -951,7 +957,7 @@ const struct file_operations kmsg_fops = {
.release = devkmsg_release,
};
-#ifdef CONFIG_CRASH_CORE
+#ifdef CONFIG_VMCORE_INFO
/*
* This appends the listed symbols to /proc/vmcore
*
@@ -1846,10 +1852,23 @@ static bool console_waiter;
*/
static void console_lock_spinning_enable(void)
{
+ /*
+ * Do not use spinning in panic(). The panic CPU wants to keep the lock.
+ * Non-panic CPUs abandon the flush anyway.
+ *
+ * Just keep the lockdep annotation. The panic-CPU should avoid
+ * taking console_owner_lock because it might cause a deadlock.
+ * This looks like the easiest way how to prevent false lockdep
+ * reports without handling races a lockless way.
+ */
+ if (panic_in_progress())
+ goto lockdep;
+
raw_spin_lock(&console_owner_lock);
console_owner = current;
raw_spin_unlock(&console_owner_lock);
+lockdep:
/* The waiter may spin on us after setting console_owner */
spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
}
@@ -1874,6 +1893,22 @@ static int console_lock_spinning_disable_and_check(int cookie)
{
int waiter;
+ /*
+ * Ignore spinning waiters during panic() because they might get stopped
+ * or blocked at any time,
+ *
+ * It is safe because nobody is allowed to start spinning during panic
+ * in the first place. If there has been a waiter then non panic CPUs
+ * might stay spinning. They would get stopped anyway. The panic context
+ * will never start spinning and an interrupted spin on panic CPU will
+ * never continue.
+ */
+ if (panic_in_progress()) {
+ /* Keep lockdep happy. */
+ spin_release(&console_owner_dep_map, _THIS_IP_);
+ return 0;
+ }
+
raw_spin_lock(&console_owner_lock);
waiter = READ_ONCE(console_waiter);
console_owner = NULL;
@@ -2270,8 +2305,12 @@ asmlinkage int vprintk_emit(int facility, int level,
if (unlikely(suppress_printk))
return 0;
- if (unlikely(suppress_panic_printk) &&
- atomic_read(&panic_cpu) != raw_smp_processor_id())
+ /*
+ * The messages on the panic CPU are the most important. If
+ * non-panic CPUs are generating any messages, they will be
+ * silently dropped.
+ */
+ if (other_cpu_in_panic())
return 0;
if (level == LOGLEVEL_SCHED) {
@@ -2601,26 +2640,6 @@ static int console_cpu_notify(unsigned int cpu)
return 0;
}
-/*
- * Return true if a panic is in progress on a remote CPU.
- *
- * On true, the local CPU should immediately release any printing resources
- * that may be needed by the panic CPU.
- */
-bool other_cpu_in_panic(void)
-{
- if (!panic_in_progress())
- return false;
-
- /*
- * We can use raw_smp_processor_id() here because it is impossible for
- * the task to be migrated to the panic_cpu, or away from it. If
- * panic_cpu has already been set, and we're not currently executing on
- * that CPU, then we never will be.
- */
- return atomic_read(&panic_cpu) != raw_smp_processor_id();
-}
-
/**
* console_lock - block the console subsystem from printing
*
@@ -2776,8 +2795,6 @@ void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped)
bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
bool is_extended, bool may_suppress)
{
- static int panic_console_dropped;
-
struct printk_buffers *pbufs = pmsg->pbufs;
const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf);
const size_t outbuf_sz = sizeof(pbufs->outbuf);
@@ -2805,17 +2822,6 @@ bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
pmsg->seq = r.info->seq;
pmsg->dropped = r.info->seq - seq;
- /*
- * Check for dropped messages in panic here so that printk
- * suppression can occur as early as possible if necessary.
- */
- if (pmsg->dropped &&
- panic_in_progress() &&
- panic_console_dropped++ > 10) {
- suppress_panic_printk = 1;
- pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n");
- }
-
/* Skip record that has level above the console loglevel. */
if (may_suppress && suppress_message_printing(r.info->level))
goto out;
@@ -3263,6 +3269,21 @@ static int __init keep_bootcon_setup(char *str)
early_param("keep_bootcon", keep_bootcon_setup);
+static int console_call_setup(struct console *newcon, char *options)
+{
+ int err;
+
+ if (!newcon->setup)
+ return 0;
+
+ /* Synchronize with possible boot console. */
+ console_lock();
+ err = newcon->setup(newcon, options);
+ console_unlock();
+
+ return err;
+}
+
/*
* This is called by register_console() to try to match
* the newly registered console with any of the ones selected
@@ -3298,8 +3319,8 @@ static int try_enable_preferred_console(struct console *newcon,
if (_braille_register_console(newcon, c))
return 0;
- if (newcon->setup &&
- (err = newcon->setup(newcon, c->options)) != 0)
+ err = console_call_setup(newcon, c->options);
+ if (err)
return err;
}
newcon->flags |= CON_ENABLED;
@@ -3325,7 +3346,7 @@ static void try_enable_default_console(struct console *newcon)
if (newcon->index < 0)
newcon->index = 0;
- if (newcon->setup && newcon->setup(newcon, NULL) != 0)
+ if (console_call_setup(newcon, NULL) != 0)
return;
newcon->flags |= CON_ENABLED;
@@ -3761,7 +3782,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
might_sleep();
- seq = prb_next_seq(prb);
+ seq = prb_next_reserve_seq(prb);
/* Flush the consoles so that records up to @seq are printed. */
console_lock();
diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c
index fde338606ce8..88e8f3a61922 100644
--- a/kernel/printk/printk_ringbuffer.c
+++ b/kernel/printk/printk_ringbuffer.c
@@ -6,6 +6,7 @@
#include <linux/errno.h>
#include <linux/bug.h>
#include "printk_ringbuffer.h"
+#include "internal.h"
/**
* DOC: printk_ringbuffer overview
@@ -303,6 +304,9 @@
*
* desc_push_tail:B / desc_reserve:D
* set descriptor reusable (state), then push descriptor tail (id)
+ *
+ * desc_update_last_finalized:A / desc_last_finalized_seq:A
+ * store finalized record, then set new highest finalized sequence number
*/
#define DATA_SIZE(data_ring) _DATA_SIZE((data_ring)->size_bits)
@@ -1030,9 +1034,13 @@ static char *data_alloc(struct printk_ringbuffer *rb, unsigned int size,
unsigned long next_lpos;
if (size == 0) {
- /* Specify a data-less block. */
- blk_lpos->begin = NO_LPOS;
- blk_lpos->next = NO_LPOS;
+ /*
+ * Data blocks are not created for empty lines. Instead, the
+ * reader will recognize these special lpos values and handle
+ * it appropriately.
+ */
+ blk_lpos->begin = EMPTY_LINE_LPOS;
+ blk_lpos->next = EMPTY_LINE_LPOS;
return NULL;
}
@@ -1210,10 +1218,18 @@ static const char *get_data(struct prb_data_ring *data_ring,
/* Data-less data block description. */
if (BLK_DATALESS(blk_lpos)) {
- if (blk_lpos->begin == NO_LPOS && blk_lpos->next == NO_LPOS) {
+ /*
+ * Records that are just empty lines are also valid, even
+ * though they do not have a data block. For such records
+ * explicitly return empty string data to signify success.
+ */
+ if (blk_lpos->begin == EMPTY_LINE_LPOS &&
+ blk_lpos->next == EMPTY_LINE_LPOS) {
*data_size = 0;
return "";
}
+
+ /* Data lost, invalid, or otherwise unavailable. */
return NULL;
}
@@ -1442,19 +1458,117 @@ fail_reopen:
}
/*
+ * @last_finalized_seq value guarantees that all records up to and including
+ * this sequence number are finalized and can be read. The only exception are
+ * too old records which have already been overwritten.
+ *
+ * It is also guaranteed that @last_finalized_seq only increases.
+ *
+ * Be aware that finalized records following non-finalized records are not
+ * reported because they are not yet available to the reader. For example,
+ * a new record stored via printk() will not be available to a printer if
+ * it follows a record that has not been finalized yet. However, once that
+ * non-finalized record becomes finalized, @last_finalized_seq will be
+ * appropriately updated and the full set of finalized records will be
+ * available to the printer. And since each printk() caller will either
+ * directly print or trigger deferred printing of all available unprinted
+ * records, all printk() messages will get printed.
+ */
+static u64 desc_last_finalized_seq(struct printk_ringbuffer *rb)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ unsigned long ulseq;
+
+ /*
+ * Guarantee the sequence number is loaded before loading the
+ * associated record in order to guarantee that the record can be
+ * seen by this CPU. This pairs with desc_update_last_finalized:A.
+ */
+ ulseq = atomic_long_read_acquire(&desc_ring->last_finalized_seq
+ ); /* LMM(desc_last_finalized_seq:A) */
+
+ return __ulseq_to_u64seq(rb, ulseq);
+}
+
+static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq,
+ struct printk_record *r, unsigned int *line_count);
+
+/*
+ * Check if there are records directly following @last_finalized_seq that are
+ * finalized. If so, update @last_finalized_seq to the latest of these
+ * records. It is not allowed to skip over records that are not yet finalized.
+ */
+static void desc_update_last_finalized(struct printk_ringbuffer *rb)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ u64 old_seq = desc_last_finalized_seq(rb);
+ unsigned long oldval;
+ unsigned long newval;
+ u64 finalized_seq;
+ u64 try_seq;
+
+try_again:
+ finalized_seq = old_seq;
+ try_seq = finalized_seq + 1;
+
+ /* Try to find later finalized records. */
+ while (_prb_read_valid(rb, &try_seq, NULL, NULL)) {
+ finalized_seq = try_seq;
+ try_seq++;
+ }
+
+ /* No update needed if no later finalized record was found. */
+ if (finalized_seq == old_seq)
+ return;
+
+ oldval = __u64seq_to_ulseq(old_seq);
+ newval = __u64seq_to_ulseq(finalized_seq);
+
+ /*
+ * Set the sequence number of a later finalized record that has been
+ * seen.
+ *
+ * Guarantee the record data is visible to other CPUs before storing
+ * its sequence number. This pairs with desc_last_finalized_seq:A.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_last_finalized_seq:A reads from
+ * desc_update_last_finalized:A, then desc_read:A reads from
+ * _prb_commit:B.
+ *
+ * Relies on:
+ *
+ * RELEASE from _prb_commit:B to desc_update_last_finalized:A
+ * matching
+ * ACQUIRE from desc_last_finalized_seq:A to desc_read:A
+ *
+ * Note: _prb_commit:B and desc_update_last_finalized:A can be
+ * different CPUs. However, the desc_update_last_finalized:A
+ * CPU (which performs the release) must have previously seen
+ * _prb_commit:B.
+ */
+ if (!atomic_long_try_cmpxchg_release(&desc_ring->last_finalized_seq,
+ &oldval, newval)) { /* LMM(desc_update_last_finalized:A) */
+ old_seq = __ulseq_to_u64seq(rb, oldval);
+ goto try_again;
+ }
+}
+
+/*
* Attempt to finalize a specified descriptor. If this fails, the descriptor
* is either already final or it will finalize itself when the writer commits.
*/
-static void desc_make_final(struct prb_desc_ring *desc_ring, unsigned long id)
+static void desc_make_final(struct printk_ringbuffer *rb, unsigned long id)
{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
unsigned long prev_state_val = DESC_SV(id, desc_committed);
struct prb_desc *d = to_desc(desc_ring, id);
- atomic_long_cmpxchg_relaxed(&d->state_var, prev_state_val,
- DESC_SV(id, desc_finalized)); /* LMM(desc_make_final:A) */
-
- /* Best effort to remember the last finalized @id. */
- atomic_long_set(&desc_ring->last_finalized_id, id);
+ if (atomic_long_try_cmpxchg_relaxed(&d->state_var, &prev_state_val,
+ DESC_SV(id, desc_finalized))) { /* LMM(desc_make_final:A) */
+ desc_update_last_finalized(rb);
+ }
}
/**
@@ -1550,7 +1664,7 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
* readers. (For seq==0 there is no previous descriptor.)
*/
if (info->seq > 0)
- desc_make_final(desc_ring, DESC_ID(id - 1));
+ desc_make_final(rb, DESC_ID(id - 1));
r->text_buf = data_alloc(rb, r->text_buf_size, &d->text_blk_lpos, id);
/* If text data allocation fails, a data-less record is committed. */
@@ -1643,7 +1757,7 @@ void prb_commit(struct prb_reserved_entry *e)
*/
head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_commit:A) */
if (head_id != e->id)
- desc_make_final(desc_ring, e->id);
+ desc_make_final(e->rb, e->id);
}
/**
@@ -1663,12 +1777,9 @@ void prb_commit(struct prb_reserved_entry *e)
*/
void prb_final_commit(struct prb_reserved_entry *e)
{
- struct prb_desc_ring *desc_ring = &e->rb->desc_ring;
-
_prb_commit(e, desc_finalized);
- /* Best effort to remember the last finalized @id. */
- atomic_long_set(&desc_ring->last_finalized_id, e->id);
+ desc_update_last_finalized(e->rb);
}
/*
@@ -1832,7 +1943,7 @@ static int prb_read(struct printk_ringbuffer *rb, u64 seq,
}
/* Get the sequence number of the tail descriptor. */
-static u64 prb_first_seq(struct printk_ringbuffer *rb)
+u64 prb_first_seq(struct printk_ringbuffer *rb)
{
struct prb_desc_ring *desc_ring = &rb->desc_ring;
enum desc_state d_state;
@@ -1875,12 +1986,123 @@ static u64 prb_first_seq(struct printk_ringbuffer *rb)
return seq;
}
+/**
+ * prb_next_reserve_seq() - Get the sequence number after the most recently
+ * reserved record.
+ *
+ * @rb: The ringbuffer to get the sequence number from.
+ *
+ * This is the public function available to readers to see what sequence
+ * number will be assigned to the next reserved record.
+ *
+ * Note that depending on the situation, this value can be equal to or
+ * higher than the sequence number returned by prb_next_seq().
+ *
+ * Context: Any context.
+ * Return: The sequence number that will be assigned to the next record
+ * reserved.
+ */
+u64 prb_next_reserve_seq(struct printk_ringbuffer *rb)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ unsigned long last_finalized_id;
+ atomic_long_t *state_var;
+ u64 last_finalized_seq;
+ unsigned long head_id;
+ struct prb_desc desc;
+ unsigned long diff;
+ struct prb_desc *d;
+ int err;
+
+ /*
+ * It may not be possible to read a sequence number for @head_id.
+ * So the ID of @last_finailzed_seq is used to calculate what the
+ * sequence number of @head_id will be.
+ */
+
+try_again:
+ last_finalized_seq = desc_last_finalized_seq(rb);
+
+ /*
+ * @head_id is loaded after @last_finalized_seq to ensure that
+ * it points to the record with @last_finalized_seq or newer.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_last_finalized_seq:A reads from
+ * desc_update_last_finalized:A, then
+ * prb_next_reserve_seq:A reads from desc_reserve:D.
+ *
+ * Relies on:
+ *
+ * RELEASE from desc_reserve:D to desc_update_last_finalized:A
+ * matching
+ * ACQUIRE from desc_last_finalized_seq:A to prb_next_reserve_seq:A
+ *
+ * Note: desc_reserve:D and desc_update_last_finalized:A can be
+ * different CPUs. However, the desc_update_last_finalized:A CPU
+ * (which performs the release) must have previously seen
+ * desc_read:C, which implies desc_reserve:D can be seen.
+ */
+ head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_next_reserve_seq:A) */
+
+ d = to_desc(desc_ring, last_finalized_seq);
+ state_var = &d->state_var;
+
+ /* Extract the ID, used to specify the descriptor to read. */
+ last_finalized_id = DESC_ID(atomic_long_read(state_var));
+
+ /* Ensure @last_finalized_id is correct. */
+ err = desc_read_finalized_seq(desc_ring, last_finalized_id, last_finalized_seq, &desc);
+
+ if (err == -EINVAL) {
+ if (last_finalized_seq == 0) {
+ /*
+ * No record has been finalized or even reserved yet.
+ *
+ * The @head_id is initialized such that the first
+ * increment will yield the first record (seq=0).
+ * Handle it separately to avoid a negative @diff
+ * below.
+ */
+ if (head_id == DESC0_ID(desc_ring->count_bits))
+ return 0;
+
+ /*
+ * One or more descriptors are already reserved. Use
+ * the descriptor ID of the first one (@seq=0) for
+ * the @diff below.
+ */
+ last_finalized_id = DESC0_ID(desc_ring->count_bits) + 1;
+ } else {
+ /* Record must have been overwritten. Try again. */
+ goto try_again;
+ }
+ }
+
+ /* Diff of known descriptor IDs to compute related sequence numbers. */
+ diff = head_id - last_finalized_id;
+
+ /*
+ * @head_id points to the most recently reserved record, but this
+ * function returns the sequence number that will be assigned to the
+ * next (not yet reserved) record. Thus +1 is needed.
+ */
+ return (last_finalized_seq + diff + 1);
+}
+
/*
- * Non-blocking read of a record. Updates @seq to the last finalized record
- * (which may have no data available).
+ * Non-blocking read of a record.
+ *
+ * On success @seq is updated to the record that was read and (if provided)
+ * @r and @line_count will contain the read/calculated data.
+ *
+ * On failure @seq is updated to a record that is not yet available to the
+ * reader, but it will be the next record available to the reader.
*
- * See the description of prb_read_valid() and prb_read_valid_info()
- * for details.
+ * Note: When the current CPU is in panic, this function will skip over any
+ * non-existent/non-finalized records in order to allow the panic CPU
+ * to print any and all records that have been finalized.
*/
static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq,
struct printk_record *r, unsigned int *line_count)
@@ -1899,12 +2121,32 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq,
*seq = tail_seq;
} else if (err == -ENOENT) {
- /* Record exists, but no data available. Skip. */
+ /* Record exists, but the data was lost. Skip. */
(*seq)++;
} else {
- /* Non-existent/non-finalized record. Must stop. */
- return false;
+ /*
+ * Non-existent/non-finalized record. Must stop.
+ *
+ * For panic situations it cannot be expected that
+ * non-finalized records will become finalized. But
+ * there may be other finalized records beyond that
+ * need to be printed for a panic situation. If this
+ * is the panic CPU, skip this
+ * non-existent/non-finalized record unless it is
+ * at or beyond the head, in which case it is not
+ * possible to continue.
+ *
+ * Note that new messages printed on panic CPU are
+ * finalized when we are here. The only exception
+ * might be the last message without trailing newline.
+ * But it would have the sequence number returned
+ * by "prb_next_reserve_seq() - 1".
+ */
+ if (this_cpu_in_panic() && ((*seq + 1) < prb_next_reserve_seq(rb)))
+ (*seq)++;
+ else
+ return false;
}
}
@@ -1932,7 +2174,7 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq,
* On success, the reader must check r->info.seq to see which record was
* actually read. This allows the reader to detect dropped records.
*
- * Failure means @seq refers to a not yet written record.
+ * Failure means @seq refers to a record not yet available to the reader.
*/
bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq,
struct printk_record *r)
@@ -1962,7 +2204,7 @@ bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq,
* On success, the reader must check info->seq to see which record meta data
* was actually read. This allows the reader to detect dropped records.
*
- * Failure means @seq refers to a not yet written record.
+ * Failure means @seq refers to a record not yet available to the reader.
*/
bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq,
struct printk_info *info, unsigned int *line_count)
@@ -2008,7 +2250,9 @@ u64 prb_first_valid_seq(struct printk_ringbuffer *rb)
* newest sequence number available to readers will be.
*
* This provides readers a sequence number to jump to if all currently
- * available records should be skipped.
+ * available records should be skipped. It is guaranteed that all records
+ * previous to the returned value have been finalized and are (or were)
+ * available to the reader.
*
* Context: Any context.
* Return: The sequence number of the next newest (not yet available) record
@@ -2016,34 +2260,19 @@ u64 prb_first_valid_seq(struct printk_ringbuffer *rb)
*/
u64 prb_next_seq(struct printk_ringbuffer *rb)
{
- struct prb_desc_ring *desc_ring = &rb->desc_ring;
- enum desc_state d_state;
- unsigned long id;
u64 seq;
- /* Check if the cached @id still points to a valid @seq. */
- id = atomic_long_read(&desc_ring->last_finalized_id);
- d_state = desc_read(desc_ring, id, NULL, &seq, NULL);
+ seq = desc_last_finalized_seq(rb);
- if (d_state == desc_finalized || d_state == desc_reusable) {
- /*
- * Begin searching after the last finalized record.
- *
- * On 0, the search must begin at 0 because of hack#2
- * of the bootstrapping phase it is not known if a
- * record at index 0 exists.
- */
- if (seq != 0)
- seq++;
- } else {
- /*
- * The information about the last finalized sequence number
- * has gone. It should happen only when there is a flood of
- * new messages and the ringbuffer is rapidly recycled.
- * Give up and start from the beginning.
- */
- seq = 0;
- }
+ /*
+ * Begin searching after the last finalized record.
+ *
+ * On 0, the search must begin at 0 because of hack#2
+ * of the bootstrapping phase it is not known if a
+ * record at index 0 exists.
+ */
+ if (seq != 0)
+ seq++;
/*
* The information about the last finalized @seq might be inaccurate.
@@ -2085,7 +2314,7 @@ void prb_init(struct printk_ringbuffer *rb,
rb->desc_ring.infos = infos;
atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits));
atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits));
- atomic_long_set(&rb->desc_ring.last_finalized_id, DESC0_ID(descbits));
+ atomic_long_set(&rb->desc_ring.last_finalized_seq, 0);
rb->text_data_ring.size_bits = textbits;
rb->text_data_ring.data = text_buf;
diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h
index 18cd25e489b8..52626d0f1fa3 100644
--- a/kernel/printk/printk_ringbuffer.h
+++ b/kernel/printk/printk_ringbuffer.h
@@ -75,7 +75,7 @@ struct prb_desc_ring {
struct printk_info *infos;
atomic_long_t head_id;
atomic_long_t tail_id;
- atomic_long_t last_finalized_id;
+ atomic_long_t last_finalized_seq;
};
/*
@@ -127,8 +127,22 @@ enum desc_state {
#define DESC_SV(id, state) (((unsigned long)state << DESC_FLAGS_SHIFT) | id)
#define DESC_ID_MASK (~DESC_FLAGS_MASK)
#define DESC_ID(sv) ((sv) & DESC_ID_MASK)
+
+/*
+ * Special data block logical position values (for fields of
+ * @prb_desc.text_blk_lpos).
+ *
+ * - Bit0 is used to identify if the record has no data block. (Implemented in
+ * the LPOS_DATALESS() macro.)
+ *
+ * - Bit1 specifies the reason for not having a data block.
+ *
+ * These special values could never be real lpos values because of the
+ * meta data and alignment padding of data blocks. (See to_blk_size() for
+ * details.)
+ */
#define FAILED_LPOS 0x1
-#define NO_LPOS 0x3
+#define EMPTY_LINE_LPOS 0x3
#define FAILED_BLK_LPOS \
{ \
@@ -259,7 +273,7 @@ static struct printk_ringbuffer name = { \
.infos = &_##name##_infos[0], \
.head_id = ATOMIC_INIT(DESC0_ID(descbits)), \
.tail_id = ATOMIC_INIT(DESC0_ID(descbits)), \
- .last_finalized_id = ATOMIC_INIT(DESC0_ID(descbits)), \
+ .last_finalized_seq = ATOMIC_INIT(0), \
}, \
.text_data_ring = { \
.size_bits = (avgtextbits) + (descbits), \
@@ -378,7 +392,41 @@ bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq,
bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq,
struct printk_info *info, unsigned int *line_count);
+u64 prb_first_seq(struct printk_ringbuffer *rb);
u64 prb_first_valid_seq(struct printk_ringbuffer *rb);
u64 prb_next_seq(struct printk_ringbuffer *rb);
+u64 prb_next_reserve_seq(struct printk_ringbuffer *rb);
+
+#ifdef CONFIG_64BIT
+
+#define __u64seq_to_ulseq(u64seq) (u64seq)
+#define __ulseq_to_u64seq(rb, ulseq) (ulseq)
+
+#else /* CONFIG_64BIT */
+
+#define __u64seq_to_ulseq(u64seq) ((u32)u64seq)
+
+static inline u64 __ulseq_to_u64seq(struct printk_ringbuffer *rb, u32 ulseq)
+{
+ u64 rb_first_seq = prb_first_seq(rb);
+ u64 seq;
+
+ /*
+ * The provided sequence is only the lower 32 bits of the ringbuffer
+ * sequence. It needs to be expanded to 64bit. Get the first sequence
+ * number from the ringbuffer and fold it.
+ *
+ * Having a 32bit representation in the console is sufficient.
+ * If a console ever gets more than 2^31 records behind
+ * the ringbuffer then this is the least of the problems.
+ *
+ * Also the access to the ring buffer is always safe.
+ */
+ seq = rb_first_seq - (s32)((u32)rb_first_seq - ulseq);
+
+ return seq;
+}
+
+#endif /* CONFIG_64BIT */
#endif /* _KERNEL_PRINTK_RINGBUFFER_H */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 2fabd497d659..d5f89f9ef29f 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -375,10 +375,13 @@ static int check_ptrace_options(unsigned long data)
return 0;
}
-static inline void ptrace_set_stopped(struct task_struct *task)
+static inline void ptrace_set_stopped(struct task_struct *task, bool seize)
{
guard(spinlock)(&task->sighand->siglock);
+ /* SEIZE doesn't trap tracee on attach */
+ if (!seize)
+ send_signal_locked(SIGSTOP, SEND_SIG_PRIV, task, PIDTYPE_PID);
/*
* If the task is already STOPPED, set JOBCTL_TRAP_STOP and
* TRAPPING, and kick it so that it transits to TRACED. TRAPPING
@@ -457,14 +460,8 @@ static int ptrace_attach(struct task_struct *task, long request,
return -EPERM;
task->ptrace = flags;
-
ptrace_link(task, current);
-
- /* SEIZE doesn't trap tracee on attach */
- if (!seize)
- send_sig_info(SIGSTOP, SEND_SIG_PRIV, task);
-
- ptrace_set_stopped(task);
+ ptrace_set_stopped(task, seize);
}
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 929fce69f555..0621e4ee31de 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6647,7 +6647,9 @@ static void __sched notrace __schedule(unsigned int sched_mode)
* if (signal_pending_state()) if (p->state & @state)
*
* Also, the membarrier system call requires a full memory barrier
- * after coming from user-space, before storing to rq->curr.
+ * after coming from user-space, before storing to rq->curr; this
+ * barrier matches a full barrier in the proximity of the membarrier
+ * system call exit.
*/
rq_lock(rq, &rf);
smp_mb__after_spinlock();
@@ -6718,12 +6720,20 @@ static void __sched notrace __schedule(unsigned int sched_mode)
*
* Here are the schemes providing that barrier on the
* various architectures:
- * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
- * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
+ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC,
+ * RISC-V. switch_mm() relies on membarrier_arch_switch_mm()
+ * on PowerPC and on RISC-V.
* - finish_lock_switch() for weakly-ordered
* architectures where spin_unlock is a full barrier,
* - switch_to() for arm64 (weakly-ordered, spin_unlock
* is a RELEASE barrier),
+ *
+ * The barrier matches a full barrier in the proximity of
+ * the membarrier system call entry.
+ *
+ * On RISC-V, this barrier pairing is also needed for the
+ * SYNC_CORE command when switching between processes, cf.
+ * the inline comments in membarrier_arch_switch_mm().
*/
++*switch_count;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c8e50fbac345..e8270e2e15cb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1831,6 +1831,12 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
int last_cpupid, this_cpupid;
/*
+ * Cannot migrate to memoryless nodes.
+ */
+ if (!node_state(dst_nid, N_MEMORY))
+ return false;
+
+ /*
* The pages in slow memory node should be migrated according
* to hot/cold instead of private/shared.
*/
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 4e715b9b278e..809194cd779f 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -254,7 +254,7 @@ static int membarrier_global_expedited(void)
return 0;
/*
- * Matches memory barriers around rq->curr modification in
+ * Matches memory barriers after rq->curr modification in
* scheduler.
*/
smp_mb(); /* system call entry is not a mb. */
@@ -304,7 +304,7 @@ static int membarrier_global_expedited(void)
/*
* Memory barrier on the caller thread _after_ we finished
- * waiting for the last IPI. Matches memory barriers around
+ * waiting for the last IPI. Matches memory barriers before
* rq->curr modification in scheduler.
*/
smp_mb(); /* exit from system call is not a mb */
@@ -324,6 +324,7 @@ static int membarrier_private_expedited(int flags, int cpu_id)
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
return -EPERM;
ipi_func = ipi_sync_core;
+ prepare_sync_core_cmd(mm);
} else if (flags == MEMBARRIER_FLAG_RSEQ) {
if (!IS_ENABLED(CONFIG_RSEQ))
return -EINVAL;
@@ -343,8 +344,12 @@ static int membarrier_private_expedited(int flags, int cpu_id)
return 0;
/*
- * Matches memory barriers around rq->curr modification in
+ * Matches memory barriers after rq->curr modification in
* scheduler.
+ *
+ * On RISC-V, this barrier pairing is also needed for the
+ * SYNC_CORE command when switching between processes, cf.
+ * the inline comments in membarrier_arch_switch_mm().
*/
smp_mb(); /* system call entry is not a mb. */
@@ -420,7 +425,7 @@ out:
/*
* Memory barrier on the caller thread _after_ we finished
- * waiting for the last IPI. Matches memory barriers around
+ * waiting for the last IPI. Matches memory barriers before
* rq->curr modification in scheduler.
*/
smp_mb(); /* exit from system call is not a mb */
diff --git a/kernel/signal.c b/kernel/signal.c
index bdca529f0f7b..7bdbcf1b78d0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2741,12 +2741,15 @@ relock:
/* Has this task already been marked for death? */
if ((signal->flags & SIGNAL_GROUP_EXIT) ||
signal->group_exec_task) {
- clear_siginfo(&ksig->info);
- ksig->info.si_signo = signr = SIGKILL;
+ signr = SIGKILL;
sigdelset(&current->pending.signal, SIGKILL);
trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO,
- &sighand->action[SIGKILL - 1]);
+ &sighand->action[SIGKILL-1]);
recalc_sigpending();
+ /*
+ * implies do_group_exit() or return to PF_USER_WORKER,
+ * no need to initialize ksig->info/etc.
+ */
goto fatal;
}
@@ -2856,7 +2859,7 @@ relock:
spin_lock_irq(&sighand->siglock);
}
- if (likely(do_signal_stop(ksig->info.si_signo))) {
+ if (likely(do_signal_stop(signr))) {
/* It released the siglock. */
goto relock;
}
@@ -2880,7 +2883,7 @@ relock:
if (sig_kernel_coredump(signr)) {
if (print_fatal_signals)
- print_fatal_signal(ksig->info.si_signo);
+ print_fatal_signal(signr);
proc_coredump_connector(current);
/*
* If it was able to dump core, this kills all
@@ -2895,8 +2898,9 @@ relock:
/*
* PF_USER_WORKER threads will catch and exit on fatal signals
- * themselves. They have cleanup that must be performed, so
- * we cannot call do_exit() on their behalf.
+ * themselves. They have cleanup that must be performed, so we
+ * cannot call do_exit() on their behalf. Note that ksig won't
+ * be properly initialized, PF_USER_WORKER's shouldn't use it.
*/
if (current->flags & PF_USER_WORKER)
goto out;
@@ -2904,17 +2908,17 @@ relock:
/*
* Death signals, no core dump.
*/
- do_group_exit(ksig->info.si_signo);
+ do_group_exit(signr);
/* NOTREACHED */
}
spin_unlock_irq(&sighand->siglock);
-out:
+
ksig->sig = signr;
- if (!(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS))
+ if (signr && !(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS))
hide_si_addr_tag_bits(ksig);
-
- return ksig->sig > 0;
+out:
+ return signr > 0;
}
/**
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 157f7ce2942d..81cc974913bb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1710,9 +1710,9 @@ static struct ctl_table kern_table[] = {
{
.procname = "ftrace_dump_on_oops",
.data = &ftrace_dump_on_oops,
- .maxlen = sizeof(int),
+ .maxlen = MAX_TRACER_SIZE,
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dostring,
},
{
.procname = "traceoff_on_warning",
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 4657cb8e8b1f..5abfa4390673 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -134,7 +134,7 @@ static struct class_interface alarmtimer_rtc_interface = {
static int alarmtimer_rtc_interface_setup(void)
{
- alarmtimer_rtc_interface.class = rtc_class;
+ alarmtimer_rtc_interface.class = &rtc_class;
return class_interface_register(&alarmtimer_rtc_interface);
}
static void alarmtimer_rtc_interface_remove(void)
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index ff49ddcc9800..3b57a73f249a 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -642,7 +642,8 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
* the base lock:
*/
if (base->is_idle) {
- WARN_ON_ONCE(!(timer->flags & TIMER_PINNED));
+ WARN_ON_ONCE(!(timer->flags & TIMER_PINNED ||
+ tick_nohz_full_cpu(base->cpu)));
wake_up_nohz_cpu(base->cpu);
}
}
@@ -2292,6 +2293,13 @@ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
*/
if (!base_local->is_idle && time_after(nextevt, basej + 1)) {
base_local->is_idle = true;
+ /*
+ * Global timers queued locally while running in a task
+ * in nohz_full mode need a self-IPI to kick reprogramming
+ * in IRQ tail.
+ */
+ if (tick_nohz_full_cpu(base_local->cpu))
+ base_global->is_idle = true;
trace_timer_base_idle(true, base_local->cpu);
}
*idle = base_local->is_idle;
@@ -2364,6 +2372,8 @@ void timer_clear_idle(void)
* path. Required for BASE_LOCAL only.
*/
__this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false);
+ if (tick_nohz_full_cpu(smp_processor_id()))
+ __this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false);
trace_timer_base_idle(false, smp_processor_id());
/* Activate without holding the timer_base->lock */
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 8f49b6b96dfd..c63a0afdcebe 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -751,26 +751,6 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
first_childevt = evt = data->evt;
- /*
- * Walking the hierarchy is required in any case when a
- * remote expiry was done before. This ensures to not lose
- * already queued events in non active groups (see section
- * "Required event and timerqueue update after a remote
- * expiry" in the documentation at the top).
- *
- * The two call sites which are executed without a remote expiry
- * before, are not prevented from propagating changes through
- * the hierarchy by the return:
- * - When entering this path by tmigr_new_timer(), @evt->ignore
- * is never set.
- * - tmigr_inactive_up() takes care of the propagation by
- * itself and ignores the return value. But an immediate
- * return is required because nothing has to be done in this
- * level as the event could be ignored.
- */
- if (evt->ignore && !remote)
- return true;
-
raw_spin_lock(&group->lock);
childstate.state = 0;
@@ -1058,8 +1038,15 @@ void tmigr_handle_remote(void)
* in tmigr_handle_remote_up() anyway. Keep this check to speed up the
* return when nothing has to be done.
*/
- if (!tmigr_check_migrator(tmc->tmgroup, tmc->childmask))
- return;
+ if (!tmigr_check_migrator(tmc->tmgroup, tmc->childmask)) {
+ /*
+ * If this CPU was an idle migrator, make sure to clear its wakeup
+ * value so it won't chase timers that have already expired elsewhere.
+ * This avoids endless requeue from tmigr_new_timer().
+ */
+ if (READ_ONCE(tmc->wakeup) == KTIME_MAX)
+ return;
+ }
data.now = get_jiffies_update(&data.basej);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 7ac6c52b25eb..0a5c4efc73c3 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1412,14 +1412,14 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
__bpf_kfunc_end_defs();
-BTF_SET8_START(key_sig_kfunc_set)
+BTF_KFUNCS_START(key_sig_kfunc_set)
BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE)
#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE)
#endif
-BTF_SET8_END(key_sig_kfunc_set)
+BTF_KFUNCS_END(key_sig_kfunc_set)
static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = {
.owner = THIS_MODULE,
@@ -1475,9 +1475,9 @@ __bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str,
__bpf_kfunc_end_defs();
-BTF_SET8_START(fs_kfunc_set_ids)
+BTF_KFUNCS_START(fs_kfunc_set_ids)
BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
-BTF_SET8_END(fs_kfunc_set_ids)
+BTF_KFUNCS_END(fs_kfunc_set_ids)
static int bpf_get_file_xattr_filter(const struct bpf_prog *prog, u32 kfunc_id)
{
@@ -1629,7 +1629,7 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_trace_vprintk:
return bpf_get_trace_vprintk_proto();
default:
- return bpf_base_func_proto(func_id);
+ return bpf_base_func_proto(func_id, prog);
}
}
@@ -2679,6 +2679,7 @@ static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link)
static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
struct bpf_link_info *info)
{
+ u64 __user *ucookies = u64_to_user_ptr(info->kprobe_multi.cookies);
u64 __user *uaddrs = u64_to_user_ptr(info->kprobe_multi.addrs);
struct bpf_kprobe_multi_link *kmulti_link;
u32 ucount = info->kprobe_multi.count;
@@ -2686,6 +2687,8 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
if (!uaddrs ^ !ucount)
return -EINVAL;
+ if (ucookies && !ucount)
+ return -EINVAL;
kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
info->kprobe_multi.count = kmulti_link->cnt;
@@ -2699,6 +2702,18 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
else
ucount = kmulti_link->cnt;
+ if (ucookies) {
+ if (kmulti_link->cookies) {
+ if (copy_to_user(ucookies, kmulti_link->cookies, ucount * sizeof(u64)))
+ return -EFAULT;
+ } else {
+ for (i = 0; i < ucount; i++) {
+ if (put_user(0, ucookies + i))
+ return -EFAULT;
+ }
+ }
+ }
+
if (kallsyms_show_value(current_cred())) {
if (copy_to_user(uaddrs, kmulti_link->addrs, ucount * sizeof(u64)))
return -EFAULT;
@@ -3241,7 +3256,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe,
.uprobe = uprobe,
};
struct bpf_prog *prog = link->link.prog;
- bool sleepable = prog->aux->sleepable;
+ bool sleepable = prog->sleepable;
struct bpf_run_ctx *old_run_ctx;
int err = 0;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 83ba342aef31..da1710499698 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1160,7 +1160,7 @@ __ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
* Search a given @hash to see if a given instruction pointer (@ip)
* exists in it.
*
- * Returns the entry that holds the @ip if found. NULL otherwise.
+ * Returns: the entry that holds the @ip if found. NULL otherwise.
*/
struct ftrace_func_entry *
ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
@@ -1282,7 +1282,7 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
/**
* ftrace_free_filter - remove all filters for an ftrace_ops
- * @ops - the ops to remove the filters from
+ * @ops: the ops to remove the filters from
*/
void ftrace_free_filter(struct ftrace_ops *ops)
{
@@ -1587,7 +1587,7 @@ static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end)
* @end: end of range to search (inclusive). @end points to the last byte
* to check.
*
- * Returns rec->ip if the related ftrace location is a least partly within
+ * Returns: rec->ip if the related ftrace location is a least partly within
* the given address range. That is, the first address of the instruction
* that is either a NOP or call to the function tracer. It checks the ftrace
* internal tables to determine if the address belongs or not.
@@ -1607,9 +1607,10 @@ unsigned long ftrace_location_range(unsigned long start, unsigned long end)
* ftrace_location - return the ftrace location
* @ip: the instruction pointer to check
*
- * If @ip matches the ftrace location, return @ip.
- * If @ip matches sym+0, return sym's ftrace location.
- * Otherwise, return 0.
+ * Returns:
+ * * If @ip matches the ftrace location, return @ip.
+ * * If @ip matches sym+0, return sym's ftrace location.
+ * * Otherwise, return 0.
*/
unsigned long ftrace_location(unsigned long ip)
{
@@ -1639,7 +1640,7 @@ out:
* @start: start of range to search
* @end: end of range to search (inclusive). @end points to the last byte to check.
*
- * Returns 1 if @start and @end contains a ftrace location.
+ * Returns: 1 if @start and @end contains a ftrace location.
* That is, the instruction that is either a NOP or call to
* the function tracer. It checks the ftrace internal tables to
* determine if the address belongs or not.
@@ -2574,7 +2575,7 @@ static void call_direct_funcs(unsigned long ip, unsigned long pip,
* wants to convert to a callback that saves all regs. If FTRACE_FL_REGS
* is not set, then it wants to convert to the normal callback.
*
- * Returns the address of the trampoline to set to
+ * Returns: the address of the trampoline to set to
*/
unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
{
@@ -2615,7 +2616,7 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
* a function that saves all the regs. Basically the '_EN' version
* represents the current state of the function.
*
- * Returns the address of the trampoline that is currently being called
+ * Returns: the address of the trampoline that is currently being called
*/
unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec)
{
@@ -2719,7 +2720,7 @@ struct ftrace_rec_iter {
/**
* ftrace_rec_iter_start - start up iterating over traced functions
*
- * Returns an iterator handle that is used to iterate over all
+ * Returns: an iterator handle that is used to iterate over all
* the records that represent address locations where functions
* are traced.
*
@@ -2751,7 +2752,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_start(void)
* ftrace_rec_iter_next - get the next record to process.
* @iter: The handle to the iterator.
*
- * Returns the next iterator after the given iterator @iter.
+ * Returns: the next iterator after the given iterator @iter.
*/
struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter)
{
@@ -2776,7 +2777,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter)
* ftrace_rec_iter_record - get the record at the iterator location
* @iter: The current iterator location
*
- * Returns the record that the current @iter is at.
+ * Returns: the record that the current @iter is at.
*/
struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter)
{
@@ -4010,6 +4011,8 @@ ftrace_avail_addrs_open(struct inode *inode, struct file *file)
* ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
* tracing_lseek() should be used as the lseek routine, and
* release must call ftrace_regex_release().
+ *
+ * Returns: 0 on success or a negative errno value on failure
*/
int
ftrace_regex_open(struct ftrace_ops *ops, int flag,
@@ -4626,7 +4629,7 @@ struct ftrace_func_mapper {
/**
* allocate_ftrace_func_mapper - allocate a new ftrace_func_mapper
*
- * Returns a ftrace_func_mapper descriptor that can be used to map ips to data.
+ * Returns: a ftrace_func_mapper descriptor that can be used to map ips to data.
*/
struct ftrace_func_mapper *allocate_ftrace_func_mapper(void)
{
@@ -4646,7 +4649,7 @@ struct ftrace_func_mapper *allocate_ftrace_func_mapper(void)
* @mapper: The mapper that has the ip maps
* @ip: the instruction pointer to find the data for
*
- * Returns the data mapped to @ip if found otherwise NULL. The return
+ * Returns: the data mapped to @ip if found otherwise NULL. The return
* is actually the address of the mapper data pointer. The address is
* returned for use cases where the data is no bigger than a long, and
* the user can use the data pointer as its data instead of having to
@@ -4672,7 +4675,7 @@ void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper,
* @ip: The instruction pointer address to map @data to
* @data: The data to map to @ip
*
- * Returns 0 on success otherwise an error.
+ * Returns: 0 on success otherwise an error.
*/
int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper,
unsigned long ip, void *data)
@@ -4701,7 +4704,7 @@ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper,
* @mapper: The mapper that has the ip maps
* @ip: The instruction pointer address to remove the data from
*
- * Returns the data if it is found, otherwise NULL.
+ * Returns: the data if it is found, otherwise NULL.
* Note, if the data pointer is used as the data itself, (see
* ftrace_func_mapper_find_ip(), then the return value may be meaningless,
* if the data pointer was set to zero.
@@ -5625,10 +5628,10 @@ EXPORT_SYMBOL_GPL(modify_ftrace_direct);
/**
* ftrace_set_filter_ip - set a function to filter on in ftrace by address
- * @ops - the ops to set the filter with
- * @ip - the address to add to or remove from the filter.
- * @remove - non zero to remove the ip from the filter
- * @reset - non zero to reset all filters before applying this filter.
+ * @ops: the ops to set the filter with
+ * @ip: the address to add to or remove from the filter.
+ * @remove: non zero to remove the ip from the filter
+ * @reset: non zero to reset all filters before applying this filter.
*
* Filters denote which functions should be enabled when tracing is enabled
* If @ip is NULL, it fails to update filter.
@@ -5647,11 +5650,11 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter_ip);
/**
* ftrace_set_filter_ips - set functions to filter on in ftrace by addresses
- * @ops - the ops to set the filter with
- * @ips - the array of addresses to add to or remove from the filter.
- * @cnt - the number of addresses in @ips
- * @remove - non zero to remove ips from the filter
- * @reset - non zero to reset all filters before applying this filter.
+ * @ops: the ops to set the filter with
+ * @ips: the array of addresses to add to or remove from the filter.
+ * @cnt: the number of addresses in @ips
+ * @remove: non zero to remove ips from the filter
+ * @reset: non zero to reset all filters before applying this filter.
*
* Filters denote which functions should be enabled when tracing is enabled
* If @ips array or any ip specified within is NULL , it fails to update filter.
@@ -5670,7 +5673,7 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter_ips);
/**
* ftrace_ops_set_global_filter - setup ops to use global filters
- * @ops - the ops which will use the global filters
+ * @ops: the ops which will use the global filters
*
* ftrace users who need global function trace filtering should call this.
* It can set the global filter only if ops were not initialized before.
@@ -5694,10 +5697,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
/**
* ftrace_set_filter - set a function to filter on in ftrace
- * @ops - the ops to set the filter with
- * @buf - the string that holds the function filter text.
- * @len - the length of the string.
- * @reset - non zero to reset all filters before applying this filter.
+ * @ops: the ops to set the filter with
+ * @buf: the string that holds the function filter text.
+ * @len: the length of the string.
+ * @reset: non-zero to reset all filters before applying this filter.
*
* Filters denote which functions should be enabled when tracing is enabled.
* If @buf is NULL and reset is set, all functions will be enabled for tracing.
@@ -5716,10 +5719,10 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter);
/**
* ftrace_set_notrace - set a function to not trace in ftrace
- * @ops - the ops to set the notrace filter with
- * @buf - the string that holds the function notrace text.
- * @len - the length of the string.
- * @reset - non zero to reset all filters before applying this filter.
+ * @ops: the ops to set the notrace filter with
+ * @buf: the string that holds the function notrace text.
+ * @len: the length of the string.
+ * @reset: non-zero to reset all filters before applying this filter.
*
* Notrace Filters denote which functions should not be enabled when tracing
* is enabled. If @buf is NULL and reset is set, all functions will be enabled
@@ -5738,9 +5741,9 @@ int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
EXPORT_SYMBOL_GPL(ftrace_set_notrace);
/**
* ftrace_set_global_filter - set a function to filter on with global tracers
- * @buf - the string that holds the function filter text.
- * @len - the length of the string.
- * @reset - non zero to reset all filters before applying this filter.
+ * @buf: the string that holds the function filter text.
+ * @len: the length of the string.
+ * @reset: non-zero to reset all filters before applying this filter.
*
* Filters denote which functions should be enabled when tracing is enabled.
* If @buf is NULL and reset is set, all functions will be enabled for tracing.
@@ -5753,9 +5756,9 @@ EXPORT_SYMBOL_GPL(ftrace_set_global_filter);
/**
* ftrace_set_global_notrace - set a function to not trace with global tracers
- * @buf - the string that holds the function notrace text.
- * @len - the length of the string.
- * @reset - non zero to reset all filters before applying this filter.
+ * @buf: the string that holds the function notrace text.
+ * @len: the length of the string.
+ * @reset: non-zero to reset all filters before applying this filter.
*
* Notrace Filters denote which functions should not be enabled when tracing
* is enabled. If @buf is NULL and reset is set, all functions will be enabled
@@ -7443,7 +7446,7 @@ NOKPROBE_SYMBOL(ftrace_ops_assist_func);
* have its own recursion protection, then it should call the
* ftrace_ops_assist_func() instead.
*
- * Returns the function that the trampoline should call for @ops.
+ * Returns: the function that the trampoline should call for @ops.
*/
ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
{
@@ -7897,7 +7900,7 @@ void ftrace_kill(void)
/**
* ftrace_is_dead - Test if ftrace is dead or not.
*
- * Returns 1 if ftrace is "dead", zero otherwise.
+ * Returns: 1 if ftrace is "dead", zero otherwise.
*/
int ftrace_is_dead(void)
{
@@ -8142,8 +8145,7 @@ static int kallsyms_callback(void *data, const char *name, unsigned long addr)
* @addrs array, which needs to be big enough to store at least @cnt
* addresses.
*
- * This function returns 0 if all provided symbols are found,
- * -ESRCH otherwise.
+ * Returns: 0 if all provided symbols are found, -ESRCH otherwise.
*/
int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs)
{
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3103a484182e..25476ead681b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -384,6 +384,7 @@ struct rb_irq_work {
struct irq_work work;
wait_queue_head_t waiters;
wait_queue_head_t full_waiters;
+ atomic_t seq;
bool waiters_pending;
bool full_waiters_pending;
bool wakeup_full;
@@ -753,6 +754,9 @@ static void rb_wake_up_waiters(struct irq_work *work)
{
struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
+ /* For waiters waiting for the first wake up */
+ (void)atomic_fetch_inc_release(&rbwork->seq);
+
wake_up_all(&rbwork->waiters);
if (rbwork->full_waiters_pending || rbwork->wakeup_full) {
/* Only cpu_buffer sets the above flags */
@@ -834,51 +838,24 @@ static bool rb_watermark_hit(struct trace_buffer *buffer, int cpu, int full)
pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
ret = !pagebusy && full_hit(buffer, cpu, full);
- if (!cpu_buffer->shortest_full ||
- cpu_buffer->shortest_full > full)
- cpu_buffer->shortest_full = full;
+ if (!ret && (!cpu_buffer->shortest_full ||
+ cpu_buffer->shortest_full > full)) {
+ cpu_buffer->shortest_full = full;
+ }
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
}
return ret;
}
-/**
- * ring_buffer_wait - wait for input to the ring buffer
- * @buffer: buffer to wait on
- * @cpu: the cpu buffer to wait on
- * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS
- *
- * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
- * as data is added to any of the @buffer's cpu buffers. Otherwise
- * it will wait for data to be added to a specific cpu buffer.
- */
-int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
+static inline bool
+rb_wait_cond(struct rb_irq_work *rbwork, struct trace_buffer *buffer,
+ int cpu, int full, ring_buffer_cond_fn cond, void *data)
{
- struct ring_buffer_per_cpu *cpu_buffer;
- DEFINE_WAIT(wait);
- struct rb_irq_work *work;
- int ret = 0;
-
- /*
- * Depending on what the caller is waiting for, either any
- * data in any cpu buffer, or a specific buffer, put the
- * caller on the appropriate wait queue.
- */
- if (cpu == RING_BUFFER_ALL_CPUS) {
- work = &buffer->irq_work;
- /* Full only makes sense on per cpu reads */
- full = 0;
- } else {
- if (!cpumask_test_cpu(cpu, buffer->cpumask))
- return -ENODEV;
- cpu_buffer = buffer->buffers[cpu];
- work = &cpu_buffer->irq_work;
- }
+ if (rb_watermark_hit(buffer, cpu, full))
+ return true;
- if (full)
- prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE);
- else
- prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
+ if (cond(data))
+ return true;
/*
* The events can happen in critical sections where
@@ -901,27 +878,82 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
* a task has been queued. It's OK for spurious wake ups.
*/
if (full)
- work->full_waiters_pending = true;
+ rbwork->full_waiters_pending = true;
else
- work->waiters_pending = true;
+ rbwork->waiters_pending = true;
- if (rb_watermark_hit(buffer, cpu, full))
- goto out;
+ return false;
+}
- if (signal_pending(current)) {
- ret = -EINTR;
- goto out;
+struct rb_wait_data {
+ struct rb_irq_work *irq_work;
+ int seq;
+};
+
+/*
+ * The default wait condition for ring_buffer_wait() is to just to exit the
+ * wait loop the first time it is woken up.
+ */
+static bool rb_wait_once(void *data)
+{
+ struct rb_wait_data *rdata = data;
+ struct rb_irq_work *rbwork = rdata->irq_work;
+
+ return atomic_read_acquire(&rbwork->seq) != rdata->seq;
+}
+
+/**
+ * ring_buffer_wait - wait for input to the ring buffer
+ * @buffer: buffer to wait on
+ * @cpu: the cpu buffer to wait on
+ * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS
+ * @cond: condition function to break out of wait (NULL to run once)
+ * @data: the data to pass to @cond.
+ *
+ * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
+ * as data is added to any of the @buffer's cpu buffers. Otherwise
+ * it will wait for data to be added to a specific cpu buffer.
+ */
+int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full,
+ ring_buffer_cond_fn cond, void *data)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct wait_queue_head *waitq;
+ struct rb_irq_work *rbwork;
+ struct rb_wait_data rdata;
+ int ret = 0;
+
+ /*
+ * Depending on what the caller is waiting for, either any
+ * data in any cpu buffer, or a specific buffer, put the
+ * caller on the appropriate wait queue.
+ */
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ rbwork = &buffer->irq_work;
+ /* Full only makes sense on per cpu reads */
+ full = 0;
+ } else {
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return -ENODEV;
+ cpu_buffer = buffer->buffers[cpu];
+ rbwork = &cpu_buffer->irq_work;
}
- schedule();
- out:
if (full)
- finish_wait(&work->full_waiters, &wait);
+ waitq = &rbwork->full_waiters;
else
- finish_wait(&work->waiters, &wait);
+ waitq = &rbwork->waiters;
+
+ /* Set up to exit loop as soon as it is woken */
+ if (!cond) {
+ cond = rb_wait_once;
+ rdata.irq_work = rbwork;
+ rdata.seq = atomic_read_acquire(&rbwork->seq);
+ data = &rdata;
+ }
- if (!ret && !rb_watermark_hit(buffer, cpu, full) && signal_pending(current))
- ret = -EINTR;
+ ret = wait_event_interruptible((*waitq),
+ rb_wait_cond(rbwork, buffer, cpu, full, cond, data));
return ret;
}
@@ -959,21 +991,30 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
}
if (full) {
- unsigned long flags;
-
poll_wait(filp, &rbwork->full_waiters, poll_table);
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ if (rb_watermark_hit(buffer, cpu, full))
+ return EPOLLIN | EPOLLRDNORM;
+ /*
+ * Only allow full_waiters_pending update to be seen after
+ * the shortest_full is set (in rb_watermark_hit). If the
+ * writer sees the full_waiters_pending flag set, it will
+ * compare the amount in the ring buffer to shortest_full.
+ * If the amount in the ring buffer is greater than the
+ * shortest_full percent, it will call the irq_work handler
+ * to wake up this list. The irq_handler will reset shortest_full
+ * back to zero. That's done under the reader_lock, but
+ * the below smp_mb() makes sure that the update to
+ * full_waiters_pending doesn't leak up into the above.
+ */
+ smp_mb();
rbwork->full_waiters_pending = true;
- if (!cpu_buffer->shortest_full ||
- cpu_buffer->shortest_full > full)
- cpu_buffer->shortest_full = full;
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
- } else {
- poll_wait(filp, &rbwork->waiters, poll_table);
- rbwork->waiters_pending = true;
+ return 0;
}
+ poll_wait(filp, &rbwork->waiters, poll_table);
+ rbwork->waiters_pending = true;
+
/*
* There's a tight race between setting the waiters_pending and
* checking if the ring buffer is empty. Once the waiters_pending bit
@@ -989,9 +1030,6 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
*/
smp_mb();
- if (full)
- return full_hit(buffer, cpu, full) ? EPOLLIN | EPOLLRDNORM : 0;
-
if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
(cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
return EPOLLIN | EPOLLRDNORM;
@@ -1485,7 +1523,8 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
list_add(&bpage->list, pages);
- page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags,
+ page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
+ mflags | __GFP_ZERO,
cpu_buffer->buffer->subbuf_order);
if (!page)
goto free_pages;
@@ -1570,7 +1609,8 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
cpu_buffer->reader_page = bpage;
- page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, cpu_buffer->buffer->subbuf_order);
+ page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_ZERO,
+ cpu_buffer->buffer->subbuf_order);
if (!page)
goto fail_free_reader;
bpage->page = page_address(page);
@@ -4350,7 +4390,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
cpu_buffer = iter->cpu_buffer;
reader = cpu_buffer->reader_page;
head_page = cpu_buffer->head_page;
- commit_page = cpu_buffer->commit_page;
+ commit_page = READ_ONCE(cpu_buffer->commit_page);
commit_ts = commit_page->page->time_stamp;
/*
@@ -5538,7 +5578,8 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
if (bpage->data)
goto out;
- page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY,
+ page = alloc_pages_node(cpu_to_node(cpu),
+ GFP_KERNEL | __GFP_NORETRY | __GFP_ZERO,
cpu_buffer->buffer->subbuf_order);
if (!page) {
kfree(bpage);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c9c898307348..233d1af39fff 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -13,7 +13,7 @@
* Copyright (C) 2004 Nadia Yvette Chambers
*/
#include <linux/ring_buffer.h>
-#include <generated/utsrelease.h>
+#include <linux/utsname.h>
#include <linux/stacktrace.h>
#include <linux/writeback.h>
#include <linux/kallsyms.h>
@@ -39,7 +39,6 @@
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/panic_notifier.h>
-#include <linux/kmemleak.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/fs.h>
@@ -105,7 +104,7 @@ dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
* tracing is active, only save the comm when a trace event
* occurred.
*/
-static DEFINE_PER_CPU(bool, trace_taskinfo_save);
+DEFINE_PER_CPU(bool, trace_taskinfo_save);
/*
* Kill all tracing for good (never come back).
@@ -131,9 +130,12 @@ cpumask_var_t __read_mostly tracing_buffer_mask;
* /proc/sys/kernel/ftrace_dump_on_oops
* Set 1 if you want to dump buffers of all CPUs
* Set 2 if you want to dump the buffer of the CPU that triggered oops
+ * Set instance name if you want to dump the specific trace instance
+ * Multiple instance dump is also supported, and instances are seperated
+ * by commas.
*/
-
-enum ftrace_dump_mode ftrace_dump_on_oops;
+/* Set to string format zero to disable by default */
+char ftrace_dump_on_oops[MAX_TRACER_SIZE] = "0";
/* When set, tracing will stop when a WARN*() is hit */
int __disable_trace_on_warning;
@@ -179,7 +181,6 @@ static void ftrace_trace_userstack(struct trace_array *tr,
struct trace_buffer *buffer,
unsigned int trace_ctx);
-#define MAX_TRACER_SIZE 100
static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
static char *default_bootup_tracer;
@@ -202,19 +203,33 @@ static int __init set_cmdline_ftrace(char *str)
}
__setup("ftrace=", set_cmdline_ftrace);
+int ftrace_dump_on_oops_enabled(void)
+{
+ if (!strcmp("0", ftrace_dump_on_oops))
+ return 0;
+ else
+ return 1;
+}
+
static int __init set_ftrace_dump_on_oops(char *str)
{
- if (*str++ != '=' || !*str || !strcmp("1", str)) {
- ftrace_dump_on_oops = DUMP_ALL;
+ if (!*str) {
+ strscpy(ftrace_dump_on_oops, "1", MAX_TRACER_SIZE);
return 1;
}
- if (!strcmp("orig_cpu", str) || !strcmp("2", str)) {
- ftrace_dump_on_oops = DUMP_ORIG;
- return 1;
- }
+ if (*str == ',') {
+ strscpy(ftrace_dump_on_oops, "1", MAX_TRACER_SIZE);
+ strscpy(ftrace_dump_on_oops + 1, str, MAX_TRACER_SIZE - 1);
+ return 1;
+ }
- return 0;
+ if (*str++ == '=') {
+ strscpy(ftrace_dump_on_oops, str, MAX_TRACER_SIZE);
+ return 1;
+ }
+
+ return 0;
}
__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
@@ -1301,6 +1316,50 @@ static void free_snapshot(struct trace_array *tr)
tr->allocated_snapshot = false;
}
+static int tracing_arm_snapshot_locked(struct trace_array *tr)
+{
+ int ret;
+
+ lockdep_assert_held(&trace_types_lock);
+
+ spin_lock(&tr->snapshot_trigger_lock);
+ if (tr->snapshot == UINT_MAX) {
+ spin_unlock(&tr->snapshot_trigger_lock);
+ return -EBUSY;
+ }
+
+ tr->snapshot++;
+ spin_unlock(&tr->snapshot_trigger_lock);
+
+ ret = tracing_alloc_snapshot_instance(tr);
+ if (ret) {
+ spin_lock(&tr->snapshot_trigger_lock);
+ tr->snapshot--;
+ spin_unlock(&tr->snapshot_trigger_lock);
+ }
+
+ return ret;
+}
+
+int tracing_arm_snapshot(struct trace_array *tr)
+{
+ int ret;
+
+ mutex_lock(&trace_types_lock);
+ ret = tracing_arm_snapshot_locked(tr);
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+}
+
+void tracing_disarm_snapshot(struct trace_array *tr)
+{
+ spin_lock(&tr->snapshot_trigger_lock);
+ if (!WARN_ON(!tr->snapshot))
+ tr->snapshot--;
+ spin_unlock(&tr->snapshot_trigger_lock);
+}
+
/**
* tracing_alloc_snapshot - allocate snapshot buffer.
*
@@ -1374,10 +1433,6 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data,
mutex_lock(&trace_types_lock);
- ret = tracing_alloc_snapshot_instance(tr);
- if (ret)
- goto fail_unlock;
-
if (tr->current_trace->use_max_tr) {
ret = -EBUSY;
goto fail_unlock;
@@ -1396,6 +1451,10 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data,
goto fail_unlock;
}
+ ret = tracing_arm_snapshot_locked(tr);
+ if (ret)
+ goto fail_unlock;
+
local_irq_disable();
arch_spin_lock(&tr->max_lock);
tr->cond_snapshot = cond_snapshot;
@@ -1440,6 +1499,8 @@ int tracing_snapshot_cond_disable(struct trace_array *tr)
arch_spin_unlock(&tr->max_lock);
local_irq_enable();
+ tracing_disarm_snapshot(tr);
+
return ret;
}
EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable);
@@ -1482,6 +1543,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr)
}
EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable);
#define free_snapshot(tr) do { } while (0)
+#define tracing_arm_snapshot_locked(tr) ({ -EBUSY; })
#endif /* CONFIG_TRACER_SNAPSHOT */
void tracer_tracing_off(struct trace_array *tr)
@@ -1955,15 +2017,36 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
#endif /* CONFIG_TRACER_MAX_TRACE */
+struct pipe_wait {
+ struct trace_iterator *iter;
+ int wait_index;
+};
+
+static bool wait_pipe_cond(void *data)
+{
+ struct pipe_wait *pwait = data;
+ struct trace_iterator *iter = pwait->iter;
+
+ if (atomic_read_acquire(&iter->wait_index) != pwait->wait_index)
+ return true;
+
+ return iter->closed;
+}
+
static int wait_on_pipe(struct trace_iterator *iter, int full)
{
+ struct pipe_wait pwait;
int ret;
/* Iterators are static, they should be filled or empty */
if (trace_buffer_iter(iter, iter->cpu_file))
return 0;
- ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full);
+ pwait.wait_index = atomic_read_acquire(&iter->wait_index);
+ pwait.iter = iter;
+
+ ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full,
+ wait_pipe_cond, &pwait);
#ifdef CONFIG_TRACER_MAX_TRACE
/*
@@ -2299,98 +2382,6 @@ void tracing_reset_all_online_cpus(void)
mutex_unlock(&trace_types_lock);
}
-/*
- * The tgid_map array maps from pid to tgid; i.e. the value stored at index i
- * is the tgid last observed corresponding to pid=i.
- */
-static int *tgid_map;
-
-/* The maximum valid index into tgid_map. */
-static size_t tgid_map_max;
-
-#define SAVED_CMDLINES_DEFAULT 128
-#define NO_CMDLINE_MAP UINT_MAX
-/*
- * Preemption must be disabled before acquiring trace_cmdline_lock.
- * The various trace_arrays' max_lock must be acquired in a context
- * where interrupt is disabled.
- */
-static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
-struct saved_cmdlines_buffer {
- unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
- unsigned *map_cmdline_to_pid;
- unsigned cmdline_num;
- int cmdline_idx;
- char saved_cmdlines[];
-};
-static struct saved_cmdlines_buffer *savedcmd;
-
-static inline char *get_saved_cmdlines(int idx)
-{
- return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN];
-}
-
-static inline void set_cmdline(int idx, const char *cmdline)
-{
- strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
-}
-
-static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
-{
- int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN);
-
- kfree(s->map_cmdline_to_pid);
- kmemleak_free(s);
- free_pages((unsigned long)s, order);
-}
-
-static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val)
-{
- struct saved_cmdlines_buffer *s;
- struct page *page;
- int orig_size, size;
- int order;
-
- /* Figure out how much is needed to hold the given number of cmdlines */
- orig_size = sizeof(*s) + val * TASK_COMM_LEN;
- order = get_order(orig_size);
- size = 1 << (order + PAGE_SHIFT);
- page = alloc_pages(GFP_KERNEL, order);
- if (!page)
- return NULL;
-
- s = page_address(page);
- kmemleak_alloc(s, size, 1, GFP_KERNEL);
- memset(s, 0, sizeof(*s));
-
- /* Round up to actual allocation */
- val = (size - sizeof(*s)) / TASK_COMM_LEN;
- s->cmdline_num = val;
-
- s->map_cmdline_to_pid = kmalloc_array(val,
- sizeof(*s->map_cmdline_to_pid),
- GFP_KERNEL);
- if (!s->map_cmdline_to_pid) {
- free_saved_cmdlines_buffer(s);
- return NULL;
- }
-
- s->cmdline_idx = 0;
- memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
- sizeof(s->map_pid_to_cmdline));
- memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
- val * sizeof(*s->map_cmdline_to_pid));
-
- return s;
-}
-
-static int trace_create_savedcmd(void)
-{
- savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT);
-
- return savedcmd ? 0 : -ENOMEM;
-}
-
int is_tracing_stopped(void)
{
return global_trace.stop_count;
@@ -2483,201 +2474,6 @@ void tracing_stop(void)
return tracing_stop_tr(&global_trace);
}
-static int trace_save_cmdline(struct task_struct *tsk)
-{
- unsigned tpid, idx;
-
- /* treat recording of idle task as a success */
- if (!tsk->pid)
- return 1;
-
- tpid = tsk->pid & (PID_MAX_DEFAULT - 1);
-
- /*
- * It's not the end of the world if we don't get
- * the lock, but we also don't want to spin
- * nor do we want to disable interrupts,
- * so if we miss here, then better luck next time.
- *
- * This is called within the scheduler and wake up, so interrupts
- * had better been disabled and run queue lock been held.
- */
- lockdep_assert_preemption_disabled();
- if (!arch_spin_trylock(&trace_cmdline_lock))
- return 0;
-
- idx = savedcmd->map_pid_to_cmdline[tpid];
- if (idx == NO_CMDLINE_MAP) {
- idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num;
-
- savedcmd->map_pid_to_cmdline[tpid] = idx;
- savedcmd->cmdline_idx = idx;
- }
-
- savedcmd->map_cmdline_to_pid[idx] = tsk->pid;
- set_cmdline(idx, tsk->comm);
-
- arch_spin_unlock(&trace_cmdline_lock);
-
- return 1;
-}
-
-static void __trace_find_cmdline(int pid, char comm[])
-{
- unsigned map;
- int tpid;
-
- if (!pid) {
- strcpy(comm, "<idle>");
- return;
- }
-
- if (WARN_ON_ONCE(pid < 0)) {
- strcpy(comm, "<XXX>");
- return;
- }
-
- tpid = pid & (PID_MAX_DEFAULT - 1);
- map = savedcmd->map_pid_to_cmdline[tpid];
- if (map != NO_CMDLINE_MAP) {
- tpid = savedcmd->map_cmdline_to_pid[map];
- if (tpid == pid) {
- strscpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN);
- return;
- }
- }
- strcpy(comm, "<...>");
-}
-
-void trace_find_cmdline(int pid, char comm[])
-{
- preempt_disable();
- arch_spin_lock(&trace_cmdline_lock);
-
- __trace_find_cmdline(pid, comm);
-
- arch_spin_unlock(&trace_cmdline_lock);
- preempt_enable();
-}
-
-static int *trace_find_tgid_ptr(int pid)
-{
- /*
- * Pairs with the smp_store_release in set_tracer_flag() to ensure that
- * if we observe a non-NULL tgid_map then we also observe the correct
- * tgid_map_max.
- */
- int *map = smp_load_acquire(&tgid_map);
-
- if (unlikely(!map || pid > tgid_map_max))
- return NULL;
-
- return &map[pid];
-}
-
-int trace_find_tgid(int pid)
-{
- int *ptr = trace_find_tgid_ptr(pid);
-
- return ptr ? *ptr : 0;
-}
-
-static int trace_save_tgid(struct task_struct *tsk)
-{
- int *ptr;
-
- /* treat recording of idle task as a success */
- if (!tsk->pid)
- return 1;
-
- ptr = trace_find_tgid_ptr(tsk->pid);
- if (!ptr)
- return 0;
-
- *ptr = tsk->tgid;
- return 1;
-}
-
-static bool tracing_record_taskinfo_skip(int flags)
-{
- if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID))))
- return true;
- if (!__this_cpu_read(trace_taskinfo_save))
- return true;
- return false;
-}
-
-/**
- * tracing_record_taskinfo - record the task info of a task
- *
- * @task: task to record
- * @flags: TRACE_RECORD_CMDLINE for recording comm
- * TRACE_RECORD_TGID for recording tgid
- */
-void tracing_record_taskinfo(struct task_struct *task, int flags)
-{
- bool done;
-
- if (tracing_record_taskinfo_skip(flags))
- return;
-
- /*
- * Record as much task information as possible. If some fail, continue
- * to try to record the others.
- */
- done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task);
- done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task);
-
- /* If recording any information failed, retry again soon. */
- if (!done)
- return;
-
- __this_cpu_write(trace_taskinfo_save, false);
-}
-
-/**
- * tracing_record_taskinfo_sched_switch - record task info for sched_switch
- *
- * @prev: previous task during sched_switch
- * @next: next task during sched_switch
- * @flags: TRACE_RECORD_CMDLINE for recording comm
- * TRACE_RECORD_TGID for recording tgid
- */
-void tracing_record_taskinfo_sched_switch(struct task_struct *prev,
- struct task_struct *next, int flags)
-{
- bool done;
-
- if (tracing_record_taskinfo_skip(flags))
- return;
-
- /*
- * Record as much task information as possible. If some fail, continue
- * to try to record the others.
- */
- done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev);
- done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next);
- done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev);
- done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next);
-
- /* If recording any information failed, retry again soon. */
- if (!done)
- return;
-
- __this_cpu_write(trace_taskinfo_save, false);
-}
-
-/* Helpers to record a specific task information */
-void tracing_record_cmdline(struct task_struct *task)
-{
- tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE);
-}
-
-void tracing_record_tgid(struct task_struct *task)
-{
- tracing_record_taskinfo(task, TRACE_RECORD_TGID);
-}
-
/*
* Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq
* overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function
@@ -4368,7 +4164,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
get_total_entries(buf, &total, &entries);
seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
- name, UTS_RELEASE);
+ name, init_utsname()->release);
seq_puts(m, "# -----------------------------------"
"---------------------------------\n");
seq_printf(m, "# latency: %lu us, #%lu/%lu, CPU#%d |"
@@ -5436,8 +5232,6 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
{
- int *map;
-
if ((mask == TRACE_ITER_RECORD_TGID) ||
(mask == TRACE_ITER_RECORD_CMD))
lockdep_assert_held(&event_mutex);
@@ -5460,20 +5254,8 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
trace_event_enable_cmd_record(enabled);
if (mask == TRACE_ITER_RECORD_TGID) {
- if (!tgid_map) {
- tgid_map_max = pid_max;
- map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
- GFP_KERNEL);
- /*
- * Pairs with smp_load_acquire() in
- * trace_find_tgid_ptr() to ensure that if it observes
- * the tgid_map we just allocated then it also observes
- * the corresponding tgid_map_max value.
- */
- smp_store_release(&tgid_map, map);
- }
- if (!tgid_map) {
+ if (trace_alloc_tgid_map() < 0) {
tr->trace_flags &= ~TRACE_ITER_RECORD_TGID;
return -ENOMEM;
}
@@ -5747,16 +5529,15 @@ static const char readme_msg[] =
"\t args: <name>=fetcharg[:type]\n"
"\t fetcharg: (%<register>|$<efield>), @<address>, @<symbol>[+|-<offset>],\n"
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
-#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS
"\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n"
+#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS
"\t <argname>[->field[->field|.field...]],\n"
-#else
- "\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n"
#endif
#else
"\t $stack<index>, $stack, $retval, $comm,\n"
#endif
"\t +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n"
+ "\t kernel return probes support: $retval, $arg<N>, $comm\n"
"\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n"
"\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n"
"\t symstr, <type>\\[<array-size>\\]\n"
@@ -5918,207 +5699,6 @@ static const struct file_operations tracing_readme_fops = {
.llseek = generic_file_llseek,
};
-static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos)
-{
- int pid = ++(*pos);
-
- return trace_find_tgid_ptr(pid);
-}
-
-static void *saved_tgids_start(struct seq_file *m, loff_t *pos)
-{
- int pid = *pos;
-
- return trace_find_tgid_ptr(pid);
-}
-
-static void saved_tgids_stop(struct seq_file *m, void *v)
-{
-}
-
-static int saved_tgids_show(struct seq_file *m, void *v)
-{
- int *entry = (int *)v;
- int pid = entry - tgid_map;
- int tgid = *entry;
-
- if (tgid == 0)
- return SEQ_SKIP;
-
- seq_printf(m, "%d %d\n", pid, tgid);
- return 0;
-}
-
-static const struct seq_operations tracing_saved_tgids_seq_ops = {
- .start = saved_tgids_start,
- .stop = saved_tgids_stop,
- .next = saved_tgids_next,
- .show = saved_tgids_show,
-};
-
-static int tracing_saved_tgids_open(struct inode *inode, struct file *filp)
-{
- int ret;
-
- ret = tracing_check_open_get_tr(NULL);
- if (ret)
- return ret;
-
- return seq_open(filp, &tracing_saved_tgids_seq_ops);
-}
-
-
-static const struct file_operations tracing_saved_tgids_fops = {
- .open = tracing_saved_tgids_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos)
-{
- unsigned int *ptr = v;
-
- if (*pos || m->count)
- ptr++;
-
- (*pos)++;
-
- for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num];
- ptr++) {
- if (*ptr == -1 || *ptr == NO_CMDLINE_MAP)
- continue;
-
- return ptr;
- }
-
- return NULL;
-}
-
-static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos)
-{
- void *v;
- loff_t l = 0;
-
- preempt_disable();
- arch_spin_lock(&trace_cmdline_lock);
-
- v = &savedcmd->map_cmdline_to_pid[0];
- while (l <= *pos) {
- v = saved_cmdlines_next(m, v, &l);
- if (!v)
- return NULL;
- }
-
- return v;
-}
-
-static void saved_cmdlines_stop(struct seq_file *m, void *v)
-{
- arch_spin_unlock(&trace_cmdline_lock);
- preempt_enable();
-}
-
-static int saved_cmdlines_show(struct seq_file *m, void *v)
-{
- char buf[TASK_COMM_LEN];
- unsigned int *pid = v;
-
- __trace_find_cmdline(*pid, buf);
- seq_printf(m, "%d %s\n", *pid, buf);
- return 0;
-}
-
-static const struct seq_operations tracing_saved_cmdlines_seq_ops = {
- .start = saved_cmdlines_start,
- .next = saved_cmdlines_next,
- .stop = saved_cmdlines_stop,
- .show = saved_cmdlines_show,
-};
-
-static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp)
-{
- int ret;
-
- ret = tracing_check_open_get_tr(NULL);
- if (ret)
- return ret;
-
- return seq_open(filp, &tracing_saved_cmdlines_seq_ops);
-}
-
-static const struct file_operations tracing_saved_cmdlines_fops = {
- .open = tracing_saved_cmdlines_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-static ssize_t
-tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- char buf[64];
- int r;
-
- preempt_disable();
- arch_spin_lock(&trace_cmdline_lock);
- r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num);
- arch_spin_unlock(&trace_cmdline_lock);
- preempt_enable();
-
- return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-}
-
-static int tracing_resize_saved_cmdlines(unsigned int val)
-{
- struct saved_cmdlines_buffer *s, *savedcmd_temp;
-
- s = allocate_cmdlines_buffer(val);
- if (!s)
- return -ENOMEM;
-
- preempt_disable();
- arch_spin_lock(&trace_cmdline_lock);
- savedcmd_temp = savedcmd;
- savedcmd = s;
- arch_spin_unlock(&trace_cmdline_lock);
- preempt_enable();
- free_saved_cmdlines_buffer(savedcmd_temp);
-
- return 0;
-}
-
-static ssize_t
-tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- unsigned long val;
- int ret;
-
- ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
- if (ret)
- return ret;
-
- /* must have at least 1 entry or less than PID_MAX_DEFAULT */
- if (!val || val > PID_MAX_DEFAULT)
- return -EINVAL;
-
- ret = tracing_resize_saved_cmdlines((unsigned int)val);
- if (ret < 0)
- return ret;
-
- *ppos += cnt;
-
- return cnt;
-}
-
-static const struct file_operations tracing_saved_cmdlines_size_fops = {
- .open = tracing_open_generic,
- .read = tracing_saved_cmdlines_size_read,
- .write = tracing_saved_cmdlines_size_write,
-};
-
#ifdef CONFIG_TRACE_EVAL_MAP_FILE
static union trace_eval_map_item *
update_eval_map(union trace_eval_map_item *ptr)
@@ -6595,11 +6175,12 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
*/
synchronize_rcu();
free_snapshot(tr);
+ tracing_disarm_snapshot(tr);
}
- if (t->use_max_tr && !tr->allocated_snapshot) {
- ret = tracing_alloc_snapshot_instance(tr);
- if (ret < 0)
+ if (!had_max_tr && t->use_max_tr) {
+ ret = tracing_arm_snapshot_locked(tr);
+ if (ret)
goto out;
}
#else
@@ -6608,8 +6189,13 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
if (t->init) {
ret = tracer_init(t, tr);
- if (ret)
+ if (ret) {
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (t->use_max_tr)
+ tracing_disarm_snapshot(tr);
+#endif
goto out;
+ }
}
tr->current_trace = t;
@@ -7711,10 +7297,11 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (tr->allocated_snapshot)
ret = resize_buffer_duplicate_size(&tr->max_buffer,
&tr->array_buffer, iter->cpu_file);
- else
- ret = tracing_alloc_snapshot_instance(tr);
- if (ret < 0)
+
+ ret = tracing_arm_snapshot_locked(tr);
+ if (ret)
break;
+
/* Now, we're going to swap */
if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
local_irq_disable();
@@ -7724,6 +7311,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer,
(void *)tr, 1);
}
+ tracing_disarm_snapshot(tr);
break;
default:
if (tr->allocated_snapshot) {
@@ -8398,9 +7986,9 @@ static int tracing_buffers_flush(struct file *file, fl_owner_t id)
struct ftrace_buffer_info *info = file->private_data;
struct trace_iterator *iter = &info->iter;
- iter->wait_index++;
+ iter->closed = true;
/* Make sure the waiters see the new wait_index */
- smp_wmb();
+ (void)atomic_fetch_inc_release(&iter->wait_index);
ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
@@ -8500,6 +8088,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
.spd_release = buffer_spd_release,
};
struct buffer_ref *ref;
+ bool woken = false;
int page_size;
int entries, i;
ssize_t ret = 0;
@@ -8573,17 +8162,17 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
/* did we read anything? */
if (!spd.nr_pages) {
- long wait_index;
if (ret)
goto out;
+ if (woken)
+ goto out;
+
ret = -EAGAIN;
if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
goto out;
- wait_index = READ_ONCE(iter->wait_index);
-
ret = wait_on_pipe(iter, iter->snapshot ? 0 : iter->tr->buffer_percent);
if (ret)
goto out;
@@ -8592,10 +8181,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
if (!tracer_tracing_is_on(iter->tr))
goto out;
- /* Make sure we see the new wait_index */
- smp_rmb();
- if (wait_index != iter->wait_index)
- goto out;
+ /* Iterate one more time to collect any new data then exit */
+ woken = true;
goto again;
}
@@ -8618,9 +8205,8 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned
mutex_lock(&trace_types_lock);
- iter->wait_index++;
/* Make sure the waiters see the new wait_index */
- smp_wmb();
+ (void)atomic_fetch_inc_release(&iter->wait_index);
ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
@@ -8857,8 +8443,13 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash,
ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops;
- if (glob[0] == '!')
- return unregister_ftrace_function_probe_func(glob+1, tr, ops);
+ if (glob[0] == '!') {
+ ret = unregister_ftrace_function_probe_func(glob+1, tr, ops);
+ if (!ret)
+ tracing_disarm_snapshot(tr);
+
+ return ret;
+ }
if (!param)
goto out_reg;
@@ -8877,12 +8468,13 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash,
return ret;
out_reg:
- ret = tracing_alloc_snapshot_instance(tr);
+ ret = tracing_arm_snapshot(tr);
if (ret < 0)
goto out;
ret = register_ftrace_function_probe(glob, tr, ops, count);
-
+ if (ret < 0)
+ tracing_disarm_snapshot(tr);
out:
return ret < 0 ? ret : 0;
}
@@ -9689,7 +9281,9 @@ trace_array_create_systems(const char *name, const char *systems)
raw_spin_lock_init(&tr->start_lock);
tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
-
+#ifdef CONFIG_TRACER_MAX_TRACE
+ spin_lock_init(&tr->snapshot_trigger_lock);
+#endif
tr->current_trace = &nop_trace;
INIT_LIST_HEAD(&tr->systems);
@@ -10254,14 +9848,14 @@ static struct notifier_block trace_die_notifier = {
static int trace_die_panic_handler(struct notifier_block *self,
unsigned long ev, void *unused)
{
- if (!ftrace_dump_on_oops)
+ if (!ftrace_dump_on_oops_enabled())
return NOTIFY_DONE;
/* The die notifier requires DIE_OOPS to trigger */
if (self == &trace_die_notifier && ev != DIE_OOPS)
return NOTIFY_DONE;
- ftrace_dump(ftrace_dump_on_oops);
+ ftrace_dump(DUMP_PARAM);
return NOTIFY_DONE;
}
@@ -10302,12 +9896,12 @@ trace_printk_seq(struct trace_seq *s)
trace_seq_init(s);
}
-void trace_init_global_iter(struct trace_iterator *iter)
+static void trace_init_iter(struct trace_iterator *iter, struct trace_array *tr)
{
- iter->tr = &global_trace;
+ iter->tr = tr;
iter->trace = iter->tr->current_trace;
iter->cpu_file = RING_BUFFER_ALL_CPUS;
- iter->array_buffer = &global_trace.array_buffer;
+ iter->array_buffer = &tr->array_buffer;
if (iter->trace && iter->trace->open)
iter->trace->open(iter);
@@ -10327,22 +9921,19 @@ void trace_init_global_iter(struct trace_iterator *iter)
iter->fmt_size = STATIC_FMT_BUF_SIZE;
}
-void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
+void trace_init_global_iter(struct trace_iterator *iter)
+{
+ trace_init_iter(iter, &global_trace);
+}
+
+static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_mode)
{
/* use static because iter can be a bit big for the stack */
static struct trace_iterator iter;
- static atomic_t dump_running;
- struct trace_array *tr = &global_trace;
unsigned int old_userobj;
unsigned long flags;
int cnt = 0, cpu;
- /* Only allow one dump user at a time. */
- if (atomic_inc_return(&dump_running) != 1) {
- atomic_dec(&dump_running);
- return;
- }
-
/*
* Always turn off tracing when we dump.
* We don't need to show trace output of what happens
@@ -10351,12 +9942,12 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
* If the user does a sysrq-z, then they can re-enable
* tracing with echo 1 > tracing_on.
*/
- tracing_off();
+ tracer_tracing_off(tr);
local_irq_save(flags);
/* Simulate the iterator */
- trace_init_global_iter(&iter);
+ trace_init_iter(&iter, tr);
for_each_tracing_cpu(cpu) {
atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
@@ -10367,21 +9958,15 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
/* don't look at user memory in panic mode */
tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
- switch (oops_dump_mode) {
- case DUMP_ALL:
- iter.cpu_file = RING_BUFFER_ALL_CPUS;
- break;
- case DUMP_ORIG:
+ if (dump_mode == DUMP_ORIG)
iter.cpu_file = raw_smp_processor_id();
- break;
- case DUMP_NONE:
- goto out_enable;
- default:
- printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
+ else
iter.cpu_file = RING_BUFFER_ALL_CPUS;
- }
- printk(KERN_TRACE "Dumping ftrace buffer:\n");
+ if (tr == &global_trace)
+ printk(KERN_TRACE "Dumping ftrace buffer:\n");
+ else
+ printk(KERN_TRACE "Dumping ftrace instance %s buffer:\n", tr->name);
/* Did function tracer already get disabled? */
if (ftrace_is_dead()) {
@@ -10423,15 +10008,84 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
else
printk(KERN_TRACE "---------------------------------\n");
- out_enable:
tr->trace_flags |= old_userobj;
for_each_tracing_cpu(cpu) {
atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
}
- atomic_dec(&dump_running);
local_irq_restore(flags);
}
+
+static void ftrace_dump_by_param(void)
+{
+ bool first_param = true;
+ char dump_param[MAX_TRACER_SIZE];
+ char *buf, *token, *inst_name;
+ struct trace_array *tr;
+
+ strscpy(dump_param, ftrace_dump_on_oops, MAX_TRACER_SIZE);
+ buf = dump_param;
+
+ while ((token = strsep(&buf, ",")) != NULL) {
+ if (first_param) {
+ first_param = false;
+ if (!strcmp("0", token))
+ continue;
+ else if (!strcmp("1", token)) {
+ ftrace_dump_one(&global_trace, DUMP_ALL);
+ continue;
+ }
+ else if (!strcmp("2", token) ||
+ !strcmp("orig_cpu", token)) {
+ ftrace_dump_one(&global_trace, DUMP_ORIG);
+ continue;
+ }
+ }
+
+ inst_name = strsep(&token, "=");
+ tr = trace_array_find(inst_name);
+ if (!tr) {
+ printk(KERN_TRACE "Instance %s not found\n", inst_name);
+ continue;
+ }
+
+ if (token && (!strcmp("2", token) ||
+ !strcmp("orig_cpu", token)))
+ ftrace_dump_one(tr, DUMP_ORIG);
+ else
+ ftrace_dump_one(tr, DUMP_ALL);
+ }
+}
+
+void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
+{
+ static atomic_t dump_running;
+
+ /* Only allow one dump user at a time. */
+ if (atomic_inc_return(&dump_running) != 1) {
+ atomic_dec(&dump_running);
+ return;
+ }
+
+ switch (oops_dump_mode) {
+ case DUMP_ALL:
+ ftrace_dump_one(&global_trace, DUMP_ALL);
+ break;
+ case DUMP_ORIG:
+ ftrace_dump_one(&global_trace, DUMP_ORIG);
+ break;
+ case DUMP_PARAM:
+ ftrace_dump_by_param();
+ break;
+ case DUMP_NONE:
+ break;
+ default:
+ printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
+ ftrace_dump_one(&global_trace, DUMP_ALL);
+ }
+
+ atomic_dec(&dump_running);
+}
EXPORT_SYMBOL_GPL(ftrace_dump);
#define WRITE_BUFSIZE 4096
@@ -10659,7 +10313,9 @@ __init static int tracer_alloc_buffers(void)
global_trace.current_trace = &nop_trace;
global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
-
+#ifdef CONFIG_TRACER_MAX_TRACE
+ spin_lock_init(&global_trace.snapshot_trigger_lock);
+#endif
ftrace_init_global_array_ops(&global_trace);
init_trace_flags_index(&global_trace);
@@ -10696,7 +10352,7 @@ __init static int tracer_alloc_buffers(void)
out_free_pipe_cpumask:
free_cpumask_var(global_trace.pipe_cpumask);
out_free_savedcmd:
- free_saved_cmdlines_buffer(savedcmd);
+ trace_free_saved_cmdlines_buffer();
out_free_temp_buffer:
ring_buffer_free(temp_buffer);
out_rm_hp_state:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 00f873910c5d..64450615ca0c 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -334,8 +334,8 @@ struct trace_array {
*/
struct array_buffer max_buffer;
bool allocated_snapshot;
-#endif
-#ifdef CONFIG_TRACER_MAX_TRACE
+ spinlock_t snapshot_trigger_lock;
+ unsigned int snapshot;
unsigned long max_latency;
#ifdef CONFIG_FSNOTIFY
struct dentry *d_max_latency;
@@ -1375,6 +1375,16 @@ static inline void trace_buffer_unlock_commit(struct trace_array *tr,
trace_buffer_unlock_commit_regs(tr, buffer, event, trace_ctx, NULL);
}
+DECLARE_PER_CPU(bool, trace_taskinfo_save);
+int trace_save_cmdline(struct task_struct *tsk);
+int trace_create_savedcmd(void);
+int trace_alloc_tgid_map(void);
+void trace_free_saved_cmdlines_buffer(void);
+
+extern const struct file_operations tracing_saved_cmdlines_fops;
+extern const struct file_operations tracing_saved_tgids_fops;
+extern const struct file_operations tracing_saved_cmdlines_size_fops;
+
DECLARE_PER_CPU(struct ring_buffer_event *, trace_buffered_event);
DECLARE_PER_CPU(int, trace_buffered_event_cnt);
void trace_buffered_event_disable(void);
@@ -1973,12 +1983,16 @@ static inline void trace_event_eval_update(struct trace_eval_map **map, int len)
#ifdef CONFIG_TRACER_SNAPSHOT
void tracing_snapshot_instance(struct trace_array *tr);
int tracing_alloc_snapshot_instance(struct trace_array *tr);
+int tracing_arm_snapshot(struct trace_array *tr);
+void tracing_disarm_snapshot(struct trace_array *tr);
#else
static inline void tracing_snapshot_instance(struct trace_array *tr) { }
static inline int tracing_alloc_snapshot_instance(struct trace_array *tr)
{
return 0;
}
+static inline int tracing_arm_snapshot(struct trace_array *tr) { return 0; }
+static inline void tracing_disarm_snapshot(struct trace_array *tr) { }
#endif
#ifdef CONFIG_PREEMPT_TRACER
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index 54d5fa35c90a..811b08439406 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -92,7 +92,6 @@ static void trace_do_benchmark(void)
bm_total += delta;
bm_totalsq += delta * delta;
-
if (bm_cnt > 1) {
/*
* Apply Welford's method to calculate standard deviation:
@@ -105,7 +104,7 @@ static void trace_do_benchmark(void)
stddev = 0;
delta = bm_total;
- do_div(delta, bm_cnt);
+ delta = div64_u64(delta, bm_cnt);
avg = delta;
if (stddev > 0) {
@@ -127,7 +126,7 @@ static void trace_do_benchmark(void)
seed = stddev;
if (!last_seed)
break;
- do_div(seed, last_seed);
+ seed = div64_u64(seed, last_seed);
seed += last_seed;
do_div(seed, 2);
} while (i++ < 10 && last_seed != seed);
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 03c851f57969..b0e0ec85912e 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -220,7 +220,7 @@ static struct trace_eprobe *alloc_event_probe(const char *group,
if (!ep->event_system)
goto error;
- ret = trace_probe_init(&ep->tp, this_event, group, false);
+ ret = trace_probe_init(&ep->tp, this_event, group, false, nargs);
if (ret < 0)
goto error;
@@ -390,8 +390,8 @@ static int get_eprobe_size(struct trace_probe *tp, void *rec)
/* Note that we don't verify it, since the code does not come from user space */
static int
-process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
- void *base)
+process_fetch_insn(struct fetch_insn *code, void *rec, void *edata,
+ void *dest, void *base)
{
unsigned long val;
int ret;
@@ -438,7 +438,7 @@ __eprobe_trace_func(struct eprobe_data *edata, void *rec)
return;
entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
- store_trace_args(&entry[1], &edata->ep->tp, rec, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &edata->ep->tp, rec, NULL, sizeof(*entry), dsize);
trace_event_buffer_commit(&fbuffer);
}
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index b33c3861fbbb..4bec043c8690 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -597,20 +597,12 @@ out:
return ret;
}
-/**
- * unregister_trigger - Generic event_command @unreg implementation
- * @glob: The raw string used to register the trigger
- * @test: Trigger-specific data used to find the trigger to remove
- * @file: The trace_event_file associated with the event
- *
- * Common implementation for event trigger unregistration.
- *
- * Usually used directly as the @unreg method in event command
- * implementations.
+/*
+ * True if the trigger was found and unregistered, else false.
*/
-static void unregister_trigger(char *glob,
- struct event_trigger_data *test,
- struct trace_event_file *file)
+static bool try_unregister_trigger(char *glob,
+ struct event_trigger_data *test,
+ struct trace_event_file *file)
{
struct event_trigger_data *data = NULL, *iter;
@@ -626,8 +618,32 @@ static void unregister_trigger(char *glob,
}
}
- if (data && data->ops->free)
- data->ops->free(data);
+ if (data) {
+ if (data->ops->free)
+ data->ops->free(data);
+
+ return true;
+ }
+
+ return false;
+}
+
+/**
+ * unregister_trigger - Generic event_command @unreg implementation
+ * @glob: The raw string used to register the trigger
+ * @test: Trigger-specific data used to find the trigger to remove
+ * @file: The trace_event_file associated with the event
+ *
+ * Common implementation for event trigger unregistration.
+ *
+ * Usually used directly as the @unreg method in event command
+ * implementations.
+ */
+static void unregister_trigger(char *glob,
+ struct event_trigger_data *test,
+ struct trace_event_file *file)
+{
+ try_unregister_trigger(glob, test, file);
}
/*
@@ -1470,12 +1486,23 @@ register_snapshot_trigger(char *glob,
struct event_trigger_data *data,
struct trace_event_file *file)
{
- int ret = tracing_alloc_snapshot_instance(file->tr);
+ int ret = tracing_arm_snapshot(file->tr);
if (ret < 0)
return ret;
- return register_trigger(glob, data, file);
+ ret = register_trigger(glob, data, file);
+ if (ret < 0)
+ tracing_disarm_snapshot(file->tr);
+ return ret;
+}
+
+static void unregister_snapshot_trigger(char *glob,
+ struct event_trigger_data *data,
+ struct trace_event_file *file)
+{
+ if (try_unregister_trigger(glob, data, file))
+ tracing_disarm_snapshot(file->tr);
}
static int
@@ -1510,7 +1537,7 @@ static struct event_command trigger_snapshot_cmd = {
.trigger_type = ETT_SNAPSHOT,
.parse = event_trigger_parse,
.reg = register_snapshot_trigger,
- .unreg = unregister_trigger,
+ .unreg = unregister_snapshot_trigger,
.get_trigger_ops = snapshot_get_trigger_ops,
.set_filter = set_trigger_filter,
};
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index e76f5e1efdf2..70d428c394b6 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -34,7 +34,8 @@
/* Limit how long of an event name plus args within the subsystem. */
#define MAX_EVENT_DESC 512
-#define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
+#define EVENT_NAME(user_event) ((user_event)->reg_name)
+#define EVENT_TP_NAME(user_event) ((user_event)->tracepoint.name)
#define MAX_FIELD_ARRAY_SIZE 1024
/*
@@ -54,10 +55,13 @@
* allows isolation for events by various means.
*/
struct user_event_group {
- char *system_name;
- struct hlist_node node;
- struct mutex reg_mutex;
+ char *system_name;
+ char *system_multi_name;
+ struct hlist_node node;
+ struct mutex reg_mutex;
DECLARE_HASHTABLE(register_table, 8);
+ /* ID that moves forward within the group for multi-event names */
+ u64 multi_id;
};
/* Group for init_user_ns mapping, top-most group */
@@ -78,6 +82,7 @@ static unsigned int current_user_events;
*/
struct user_event {
struct user_event_group *group;
+ char *reg_name;
struct tracepoint tracepoint;
struct trace_event_call call;
struct trace_event_class class;
@@ -127,6 +132,8 @@ struct user_event_enabler {
#define ENABLE_BIT(e) ((int)((e)->values & ENABLE_VAL_BIT_MASK))
+#define EVENT_MULTI_FORMAT(f) ((f) & USER_EVENT_REG_MULTI_FORMAT)
+
/* Used for asynchronous faulting in of pages */
struct user_event_enabler_fault {
struct work_struct work;
@@ -202,6 +209,8 @@ static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm);
static struct user_event_mm *user_event_mm_get_all(struct user_event *user);
static void user_event_mm_put(struct user_event_mm *mm);
static int destroy_user_event(struct user_event *user);
+static bool user_fields_match(struct user_event *user, int argc,
+ const char **argv);
static u32 user_event_key(char *name)
{
@@ -328,6 +337,7 @@ out:
static void user_event_group_destroy(struct user_event_group *group)
{
kfree(group->system_name);
+ kfree(group->system_multi_name);
kfree(group);
}
@@ -346,6 +356,11 @@ static char *user_event_group_system_name(void)
return system_name;
}
+static char *user_event_group_system_multi_name(void)
+{
+ return kstrdup(USER_EVENTS_MULTI_SYSTEM, GFP_KERNEL);
+}
+
static struct user_event_group *current_user_event_group(void)
{
return init_group;
@@ -365,6 +380,11 @@ static struct user_event_group *user_event_group_create(void)
if (!group->system_name)
goto error;
+ group->system_multi_name = user_event_group_system_multi_name();
+
+ if (!group->system_multi_name)
+ goto error;
+
mutex_init(&group->reg_mutex);
hash_init(group->register_table);
@@ -1480,6 +1500,11 @@ static int destroy_user_event(struct user_event *user)
hash_del(&user->node);
user_event_destroy_validators(user);
+
+ /* If we have different names, both must be freed */
+ if (EVENT_NAME(user) != EVENT_TP_NAME(user))
+ kfree(EVENT_TP_NAME(user));
+
kfree(user->call.print_fmt);
kfree(EVENT_NAME(user));
kfree(user);
@@ -1493,17 +1518,36 @@ static int destroy_user_event(struct user_event *user)
}
static struct user_event *find_user_event(struct user_event_group *group,
- char *name, u32 *outkey)
+ char *name, int argc, const char **argv,
+ u32 flags, u32 *outkey)
{
struct user_event *user;
u32 key = user_event_key(name);
*outkey = key;
- hash_for_each_possible(group->register_table, user, node, key)
- if (!strcmp(EVENT_NAME(user), name))
+ hash_for_each_possible(group->register_table, user, node, key) {
+ /*
+ * Single-format events shouldn't return multi-format
+ * events. Callers expect the underlying tracepoint to match
+ * the name exactly in these cases. Only check like-formats.
+ */
+ if (EVENT_MULTI_FORMAT(flags) != EVENT_MULTI_FORMAT(user->reg_flags))
+ continue;
+
+ if (strcmp(EVENT_NAME(user), name))
+ continue;
+
+ if (user_fields_match(user, argc, argv))
return user_event_get(user);
+ /* Scan others if this is a multi-format event */
+ if (EVENT_MULTI_FORMAT(flags))
+ continue;
+
+ return ERR_PTR(-EADDRINUSE);
+ }
+
return NULL;
}
@@ -1860,6 +1904,9 @@ static bool user_fields_match(struct user_event *user, int argc,
struct list_head *head = &user->fields;
int i = 0;
+ if (argc == 0)
+ return list_empty(head);
+
list_for_each_entry_reverse(field, head, link) {
if (!user_field_match(field, argc, argv, &i))
return false;
@@ -1877,13 +1924,15 @@ static bool user_event_match(const char *system, const char *event,
struct user_event *user = container_of(ev, struct user_event, devent);
bool match;
- match = strcmp(EVENT_NAME(user), event) == 0 &&
- (!system || strcmp(system, USER_EVENTS_SYSTEM) == 0);
+ match = strcmp(EVENT_NAME(user), event) == 0;
+
+ if (match && system) {
+ match = strcmp(system, user->group->system_name) == 0 ||
+ strcmp(system, user->group->system_multi_name) == 0;
+ }
- if (match && argc > 0)
+ if (match)
match = user_fields_match(user, argc, argv);
- else if (match && argc == 0)
- match = list_empty(&user->fields);
return match;
}
@@ -1913,6 +1962,33 @@ static int user_event_trace_register(struct user_event *user)
return ret;
}
+static int user_event_set_tp_name(struct user_event *user)
+{
+ lockdep_assert_held(&user->group->reg_mutex);
+
+ if (EVENT_MULTI_FORMAT(user->reg_flags)) {
+ char *multi_name;
+
+ multi_name = kasprintf(GFP_KERNEL_ACCOUNT, "%s.%llx",
+ user->reg_name, user->group->multi_id);
+
+ if (!multi_name)
+ return -ENOMEM;
+
+ user->call.name = multi_name;
+ user->tracepoint.name = multi_name;
+
+ /* Inc to ensure unique multi-event name next time */
+ user->group->multi_id++;
+ } else {
+ /* Non Multi-format uses register name */
+ user->call.name = user->reg_name;
+ user->tracepoint.name = user->reg_name;
+ }
+
+ return 0;
+}
+
/*
* Parses the event name, arguments and flags then registers if successful.
* The name buffer lifetime is owned by this method for success cases only.
@@ -1922,11 +1998,11 @@ static int user_event_parse(struct user_event_group *group, char *name,
char *args, char *flags,
struct user_event **newuser, int reg_flags)
{
- int ret;
- u32 key;
struct user_event *user;
+ char **argv = NULL;
int argc = 0;
- char **argv;
+ int ret;
+ u32 key;
/* Currently don't support any text based flags */
if (flags != NULL)
@@ -1935,41 +2011,34 @@ static int user_event_parse(struct user_event_group *group, char *name,
if (!user_event_capable(reg_flags))
return -EPERM;
+ if (args) {
+ argv = argv_split(GFP_KERNEL, args, &argc);
+
+ if (!argv)
+ return -ENOMEM;
+ }
+
/* Prevent dyn_event from racing */
mutex_lock(&event_mutex);
- user = find_user_event(group, name, &key);
+ user = find_user_event(group, name, argc, (const char **)argv,
+ reg_flags, &key);
mutex_unlock(&event_mutex);
- if (user) {
- if (args) {
- argv = argv_split(GFP_KERNEL, args, &argc);
- if (!argv) {
- ret = -ENOMEM;
- goto error;
- }
+ if (argv)
+ argv_free(argv);
- ret = user_fields_match(user, argc, (const char **)argv);
- argv_free(argv);
-
- } else
- ret = list_empty(&user->fields);
-
- if (ret) {
- *newuser = user;
- /*
- * Name is allocated by caller, free it since it already exists.
- * Caller only worries about failure cases for freeing.
- */
- kfree(name);
- } else {
- ret = -EADDRINUSE;
- goto error;
- }
+ if (IS_ERR(user))
+ return PTR_ERR(user);
+
+ if (user) {
+ *newuser = user;
+ /*
+ * Name is allocated by caller, free it since it already exists.
+ * Caller only worries about failure cases for freeing.
+ */
+ kfree(name);
return 0;
-error:
- user_event_put(user, false);
- return ret;
}
user = kzalloc(sizeof(*user), GFP_KERNEL_ACCOUNT);
@@ -1982,7 +2051,13 @@ error:
INIT_LIST_HEAD(&user->validators);
user->group = group;
- user->tracepoint.name = name;
+ user->reg_name = name;
+ user->reg_flags = reg_flags;
+
+ ret = user_event_set_tp_name(user);
+
+ if (ret)
+ goto put_user;
ret = user_event_parse_fields(user, args);
@@ -1996,11 +2071,14 @@ error:
user->call.data = user;
user->call.class = &user->class;
- user->call.name = name;
user->call.flags = TRACE_EVENT_FL_TRACEPOINT;
user->call.tp = &user->tracepoint;
user->call.event.funcs = &user_event_funcs;
- user->class.system = group->system_name;
+
+ if (EVENT_MULTI_FORMAT(user->reg_flags))
+ user->class.system = group->system_multi_name;
+ else
+ user->class.system = group->system_name;
user->class.fields_array = user_event_fields_array;
user->class.get_fields = user_event_get_fields;
@@ -2022,8 +2100,6 @@ error:
if (ret)
goto put_user_lock;
- user->reg_flags = reg_flags;
-
if (user->reg_flags & USER_EVENT_REG_PERSIST) {
/* Ensure we track self ref and caller ref (2) */
refcount_set(&user->refcnt, 2);
@@ -2047,30 +2123,43 @@ put_user:
user_event_destroy_fields(user);
user_event_destroy_validators(user);
kfree(user->call.print_fmt);
+
+ /* Caller frees reg_name on error, but not multi-name */
+ if (EVENT_NAME(user) != EVENT_TP_NAME(user))
+ kfree(EVENT_TP_NAME(user));
+
kfree(user);
return ret;
}
/*
- * Deletes a previously created event if it is no longer being used.
+ * Deletes previously created events if they are no longer being used.
*/
static int delete_user_event(struct user_event_group *group, char *name)
{
- u32 key;
- struct user_event *user = find_user_event(group, name, &key);
+ struct user_event *user;
+ struct hlist_node *tmp;
+ u32 key = user_event_key(name);
+ int ret = -ENOENT;
- if (!user)
- return -ENOENT;
+ /* Attempt to delete all event(s) with the name passed in */
+ hash_for_each_possible_safe(group->register_table, user, tmp, node, key) {
+ if (strcmp(EVENT_NAME(user), name))
+ continue;
- user_event_put(user, true);
+ if (!user_event_last_ref(user))
+ return -EBUSY;
- if (!user_event_last_ref(user))
- return -EBUSY;
+ if (!user_event_capable(user->reg_flags))
+ return -EPERM;
- if (!user_event_capable(user->reg_flags))
- return -EPERM;
+ ret = destroy_user_event(user);
- return destroy_user_event(user);
+ if (ret)
+ goto out;
+ }
+out:
+ return ret;
}
/*
@@ -2628,7 +2717,7 @@ static int user_seq_show(struct seq_file *m, void *p)
hash_for_each(group->register_table, i, user, node) {
status = user->status;
- seq_printf(m, "%s", EVENT_NAME(user));
+ seq_printf(m, "%s", EVENT_TP_NAME(user));
if (status != 0)
seq_puts(m, " #");
diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
index 7d2ddbcfa377..4f4280815522 100644
--- a/kernel/trace/trace_fprobe.c
+++ b/kernel/trace/trace_fprobe.c
@@ -4,6 +4,7 @@
* Copyright (C) 2022 Google LLC.
*/
#define pr_fmt(fmt) "trace_fprobe: " fmt
+#include <asm/ptrace.h>
#include <linux/fprobe.h>
#include <linux/module.h>
@@ -129,8 +130,8 @@ static bool trace_fprobe_is_registered(struct trace_fprobe *tf)
* from user space.
*/
static int
-process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
- void *base)
+process_fetch_insn(struct fetch_insn *code, void *rec, void *edata,
+ void *dest, void *base)
{
struct pt_regs *regs = rec;
unsigned long val;
@@ -152,6 +153,9 @@ retry:
case FETCH_OP_ARG:
val = regs_get_kernel_argument(regs, code->param);
break;
+ case FETCH_OP_EDATA:
+ val = *(unsigned long *)((unsigned long)edata + code->offset);
+ break;
#endif
case FETCH_NOP_SYMBOL: /* Ignore a place holder */
code++;
@@ -184,7 +188,7 @@ __fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
if (trace_trigger_soft_disabled(trace_file))
return;
- dsize = __get_data_size(&tf->tp, regs);
+ dsize = __get_data_size(&tf->tp, regs, NULL);
entry = trace_event_buffer_reserve(&fbuffer, trace_file,
sizeof(*entry) + tf->tp.size + dsize);
@@ -194,7 +198,7 @@ __fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
fbuffer.regs = regs;
entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
entry->ip = entry_ip;
- store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tf->tp, regs, NULL, sizeof(*entry), dsize);
trace_event_buffer_commit(&fbuffer);
}
@@ -210,11 +214,24 @@ fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
}
NOKPROBE_SYMBOL(fentry_trace_func);
-/* Kretprobe handler */
+/* function exit handler */
+static int trace_fprobe_entry_handler(struct fprobe *fp, unsigned long entry_ip,
+ unsigned long ret_ip, struct pt_regs *regs,
+ void *entry_data)
+{
+ struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp);
+
+ if (tf->tp.entry_arg)
+ store_trace_entry_data(entry_data, &tf->tp, regs);
+
+ return 0;
+}
+NOKPROBE_SYMBOL(trace_fprobe_entry_handler)
+
static nokprobe_inline void
__fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
unsigned long ret_ip, struct pt_regs *regs,
- struct trace_event_file *trace_file)
+ void *entry_data, struct trace_event_file *trace_file)
{
struct fexit_trace_entry_head *entry;
struct trace_event_buffer fbuffer;
@@ -227,7 +244,7 @@ __fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
if (trace_trigger_soft_disabled(trace_file))
return;
- dsize = __get_data_size(&tf->tp, regs);
+ dsize = __get_data_size(&tf->tp, regs, entry_data);
entry = trace_event_buffer_reserve(&fbuffer, trace_file,
sizeof(*entry) + tf->tp.size + dsize);
@@ -238,19 +255,19 @@ __fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
entry->func = entry_ip;
entry->ret_ip = ret_ip;
- store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tf->tp, regs, entry_data, sizeof(*entry), dsize);
trace_event_buffer_commit(&fbuffer);
}
static void
fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
- unsigned long ret_ip, struct pt_regs *regs)
+ unsigned long ret_ip, struct pt_regs *regs, void *entry_data)
{
struct event_file_link *link;
trace_probe_for_each_link_rcu(link, &tf->tp)
- __fexit_trace_func(tf, entry_ip, ret_ip, regs, link->file);
+ __fexit_trace_func(tf, entry_ip, ret_ip, regs, entry_data, link->file);
}
NOKPROBE_SYMBOL(fexit_trace_func);
@@ -269,7 +286,7 @@ static int fentry_perf_func(struct trace_fprobe *tf, unsigned long entry_ip,
if (hlist_empty(head))
return 0;
- dsize = __get_data_size(&tf->tp, regs);
+ dsize = __get_data_size(&tf->tp, regs, NULL);
__size = sizeof(*entry) + tf->tp.size + dsize;
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
@@ -280,7 +297,7 @@ static int fentry_perf_func(struct trace_fprobe *tf, unsigned long entry_ip,
entry->ip = entry_ip;
memset(&entry[1], 0, dsize);
- store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tf->tp, regs, NULL, sizeof(*entry), dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
return 0;
@@ -289,7 +306,8 @@ NOKPROBE_SYMBOL(fentry_perf_func);
static void
fexit_perf_func(struct trace_fprobe *tf, unsigned long entry_ip,
- unsigned long ret_ip, struct pt_regs *regs)
+ unsigned long ret_ip, struct pt_regs *regs,
+ void *entry_data)
{
struct trace_event_call *call = trace_probe_event_call(&tf->tp);
struct fexit_trace_entry_head *entry;
@@ -301,7 +319,7 @@ fexit_perf_func(struct trace_fprobe *tf, unsigned long entry_ip,
if (hlist_empty(head))
return;
- dsize = __get_data_size(&tf->tp, regs);
+ dsize = __get_data_size(&tf->tp, regs, entry_data);
__size = sizeof(*entry) + tf->tp.size + dsize;
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
@@ -312,7 +330,7 @@ fexit_perf_func(struct trace_fprobe *tf, unsigned long entry_ip,
entry->func = entry_ip;
entry->ret_ip = ret_ip;
- store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tf->tp, regs, entry_data, sizeof(*entry), dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
}
@@ -343,10 +361,10 @@ static void fexit_dispatcher(struct fprobe *fp, unsigned long entry_ip,
struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp);
if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE))
- fexit_trace_func(tf, entry_ip, ret_ip, regs);
+ fexit_trace_func(tf, entry_ip, ret_ip, regs, entry_data);
#ifdef CONFIG_PERF_EVENTS
if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE))
- fexit_perf_func(tf, entry_ip, ret_ip, regs);
+ fexit_perf_func(tf, entry_ip, ret_ip, regs, entry_data);
#endif
}
NOKPROBE_SYMBOL(fexit_dispatcher);
@@ -389,7 +407,7 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group,
tf->tpoint = tpoint;
tf->fp.nr_maxactive = maxactive;
- ret = trace_probe_init(&tf->tp, event, group, false);
+ ret = trace_probe_init(&tf->tp, event, group, false, nargs);
if (ret < 0)
goto error;
@@ -1109,6 +1127,11 @@ static int __trace_fprobe_create(int argc, const char *argv[])
goto error; /* This can be -ENOMEM */
}
+ if (is_return && tf->tp.entry_arg) {
+ tf->fp.entry_handler = trace_fprobe_entry_handler;
+ tf->fp.entry_data_size = traceprobe_get_entry_data_size(&tf->tp);
+ }
+
ret = traceprobe_set_print_fmt(&tf->tp,
is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL);
if (ret < 0)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index c4c6e0e0068b..14099cc17fc9 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -290,7 +290,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
INIT_HLIST_NODE(&tk->rp.kp.hlist);
INIT_LIST_HEAD(&tk->rp.kp.list);
- ret = trace_probe_init(&tk->tp, event, group, false);
+ ret = trace_probe_init(&tk->tp, event, group, false, nargs);
if (ret < 0)
goto error;
@@ -740,6 +740,9 @@ static unsigned int number_of_same_symbols(char *func_name)
return ctx.count;
}
+static int trace_kprobe_entry_handler(struct kretprobe_instance *ri,
+ struct pt_regs *regs);
+
static int __trace_kprobe_create(int argc, const char *argv[])
{
/*
@@ -948,6 +951,11 @@ static int __trace_kprobe_create(int argc, const char *argv[])
if (ret)
goto error; /* This can be -ENOMEM */
}
+ /* entry handler for kretprobe */
+ if (is_return && tk->tp.entry_arg) {
+ tk->rp.entry_handler = trace_kprobe_entry_handler;
+ tk->rp.data_size = traceprobe_get_entry_data_size(&tk->tp);
+ }
ptype = is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL;
ret = traceprobe_set_print_fmt(&tk->tp, ptype);
@@ -1303,8 +1311,8 @@ static const struct file_operations kprobe_profile_ops = {
/* Note that we don't verify it, since the code does not come from user space */
static int
-process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
- void *base)
+process_fetch_insn(struct fetch_insn *code, void *rec, void *edata,
+ void *dest, void *base)
{
struct pt_regs *regs = rec;
unsigned long val;
@@ -1329,6 +1337,9 @@ retry:
case FETCH_OP_ARG:
val = regs_get_kernel_argument(regs, code->param);
break;
+ case FETCH_OP_EDATA:
+ val = *(unsigned long *)((unsigned long)edata + code->offset);
+ break;
#endif
case FETCH_NOP_SYMBOL: /* Ignore a place holder */
code++;
@@ -1359,7 +1370,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
if (trace_trigger_soft_disabled(trace_file))
return;
- dsize = __get_data_size(&tk->tp, regs);
+ dsize = __get_data_size(&tk->tp, regs, NULL);
entry = trace_event_buffer_reserve(&fbuffer, trace_file,
sizeof(*entry) + tk->tp.size + dsize);
@@ -1368,7 +1379,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
fbuffer.regs = regs;
entry->ip = (unsigned long)tk->rp.kp.addr;
- store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tk->tp, regs, NULL, sizeof(*entry), dsize);
trace_event_buffer_commit(&fbuffer);
}
@@ -1384,6 +1395,31 @@ kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs)
NOKPROBE_SYMBOL(kprobe_trace_func);
/* Kretprobe handler */
+
+static int trace_kprobe_entry_handler(struct kretprobe_instance *ri,
+ struct pt_regs *regs)
+{
+ struct kretprobe *rp = get_kretprobe(ri);
+ struct trace_kprobe *tk;
+
+ /*
+ * There is a small chance that get_kretprobe(ri) returns NULL when
+ * the kretprobe is unregister on another CPU between kretprobe's
+ * trampoline_handler and this function.
+ */
+ if (unlikely(!rp))
+ return -ENOENT;
+
+ tk = container_of(rp, struct trace_kprobe, rp);
+
+ /* store argument values into ri->data as entry data */
+ if (tk->tp.entry_arg)
+ store_trace_entry_data(ri->data, &tk->tp, regs);
+
+ return 0;
+}
+
+
static nokprobe_inline void
__kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
struct pt_regs *regs,
@@ -1399,7 +1435,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
if (trace_trigger_soft_disabled(trace_file))
return;
- dsize = __get_data_size(&tk->tp, regs);
+ dsize = __get_data_size(&tk->tp, regs, ri->data);
entry = trace_event_buffer_reserve(&fbuffer, trace_file,
sizeof(*entry) + tk->tp.size + dsize);
@@ -1409,7 +1445,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
fbuffer.regs = regs;
entry->func = (unsigned long)tk->rp.kp.addr;
entry->ret_ip = get_kretprobe_retaddr(ri);
- store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tk->tp, regs, ri->data, sizeof(*entry), dsize);
trace_event_buffer_commit(&fbuffer);
}
@@ -1557,7 +1593,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
if (hlist_empty(head))
return 0;
- dsize = __get_data_size(&tk->tp, regs);
+ dsize = __get_data_size(&tk->tp, regs, NULL);
__size = sizeof(*entry) + tk->tp.size + dsize;
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
@@ -1568,7 +1604,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
entry->ip = (unsigned long)tk->rp.kp.addr;
memset(&entry[1], 0, dsize);
- store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tk->tp, regs, NULL, sizeof(*entry), dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
return 0;
@@ -1593,7 +1629,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
if (hlist_empty(head))
return;
- dsize = __get_data_size(&tk->tp, regs);
+ dsize = __get_data_size(&tk->tp, regs, ri->data);
__size = sizeof(*entry) + tk->tp.size + dsize;
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
@@ -1604,7 +1640,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
entry->func = (unsigned long)tk->rp.kp.addr;
entry->ret_ip = get_kretprobe_retaddr(ri);
- store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tk->tp, regs, ri->data, sizeof(*entry), dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
}
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 34289f9c6707..217169de0920 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -594,6 +594,8 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
return 0;
}
+static int __store_entry_arg(struct trace_probe *tp, int argnum);
+
static int parse_btf_arg(char *varname,
struct fetch_insn **pcode, struct fetch_insn *end,
struct traceprobe_parse_context *ctx)
@@ -618,11 +620,7 @@ static int parse_btf_arg(char *varname,
return -EOPNOTSUPP;
}
- if (ctx->flags & TPARG_FL_RETURN) {
- if (strcmp(varname, "$retval") != 0) {
- trace_probe_log_err(ctx->offset, NO_BTFARG);
- return -ENOENT;
- }
+ if (ctx->flags & TPARG_FL_RETURN && !strcmp(varname, "$retval")) {
code->op = FETCH_OP_RETVAL;
/* Check whether the function return type is not void */
if (query_btf_context(ctx) == 0) {
@@ -654,11 +652,21 @@ static int parse_btf_arg(char *varname,
const char *name = btf_name_by_offset(ctx->btf, params[i].name_off);
if (name && !strcmp(name, varname)) {
- code->op = FETCH_OP_ARG;
- if (ctx->flags & TPARG_FL_TPOINT)
- code->param = i + 1;
- else
- code->param = i;
+ if (tparg_is_function_entry(ctx->flags)) {
+ code->op = FETCH_OP_ARG;
+ if (ctx->flags & TPARG_FL_TPOINT)
+ code->param = i + 1;
+ else
+ code->param = i;
+ } else if (tparg_is_function_return(ctx->flags)) {
+ code->op = FETCH_OP_EDATA;
+ ret = __store_entry_arg(ctx->tp, i);
+ if (ret < 0) {
+ /* internal error */
+ return ret;
+ }
+ code->offset = ret;
+ }
tid = params[i].type;
goto found;
}
@@ -755,6 +763,110 @@ static int check_prepare_btf_string_fetch(char *typename,
#endif
+#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+
+static int __store_entry_arg(struct trace_probe *tp, int argnum)
+{
+ struct probe_entry_arg *earg = tp->entry_arg;
+ bool match = false;
+ int i, offset;
+
+ if (!earg) {
+ earg = kzalloc(sizeof(*tp->entry_arg), GFP_KERNEL);
+ if (!earg)
+ return -ENOMEM;
+ earg->size = 2 * tp->nr_args + 1;
+ earg->code = kcalloc(earg->size, sizeof(struct fetch_insn),
+ GFP_KERNEL);
+ if (!earg->code) {
+ kfree(earg);
+ return -ENOMEM;
+ }
+ /* Fill the code buffer with 'end' to simplify it */
+ for (i = 0; i < earg->size; i++)
+ earg->code[i].op = FETCH_OP_END;
+ tp->entry_arg = earg;
+ }
+
+ offset = 0;
+ for (i = 0; i < earg->size - 1; i++) {
+ switch (earg->code[i].op) {
+ case FETCH_OP_END:
+ earg->code[i].op = FETCH_OP_ARG;
+ earg->code[i].param = argnum;
+ earg->code[i + 1].op = FETCH_OP_ST_EDATA;
+ earg->code[i + 1].offset = offset;
+ return offset;
+ case FETCH_OP_ARG:
+ match = (earg->code[i].param == argnum);
+ break;
+ case FETCH_OP_ST_EDATA:
+ offset = earg->code[i].offset;
+ if (match)
+ return offset;
+ offset += sizeof(unsigned long);
+ break;
+ default:
+ break;
+ }
+ }
+ return -ENOSPC;
+}
+
+int traceprobe_get_entry_data_size(struct trace_probe *tp)
+{
+ struct probe_entry_arg *earg = tp->entry_arg;
+ int i, size = 0;
+
+ if (!earg)
+ return 0;
+
+ for (i = 0; i < earg->size; i++) {
+ switch (earg->code[i].op) {
+ case FETCH_OP_END:
+ goto out;
+ case FETCH_OP_ST_EDATA:
+ size = earg->code[i].offset + sizeof(unsigned long);
+ break;
+ default:
+ break;
+ }
+ }
+out:
+ return size;
+}
+
+void store_trace_entry_data(void *edata, struct trace_probe *tp, struct pt_regs *regs)
+{
+ struct probe_entry_arg *earg = tp->entry_arg;
+ unsigned long val;
+ int i;
+
+ if (!earg)
+ return;
+
+ for (i = 0; i < earg->size; i++) {
+ struct fetch_insn *code = &earg->code[i];
+
+ switch (code->op) {
+ case FETCH_OP_ARG:
+ val = regs_get_kernel_argument(regs, code->param);
+ break;
+ case FETCH_OP_ST_EDATA:
+ *(unsigned long *)((unsigned long)edata + code->offset) = val;
+ break;
+ case FETCH_OP_END:
+ goto end;
+ default:
+ break;
+ }
+ }
+end:
+ return;
+}
+NOKPROBE_SYMBOL(store_trace_entry_data)
+#endif
+
#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
/* Parse $vars. @orig_arg points '$', which syncs to @ctx->offset */
@@ -830,7 +942,7 @@ static int parse_probe_vars(char *orig_arg, const struct fetch_type *t,
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
len = str_has_prefix(arg, "arg");
- if (len && tparg_is_function_entry(ctx->flags)) {
+ if (len) {
ret = kstrtoul(arg + len, 10, &param);
if (ret)
goto inval;
@@ -839,15 +951,29 @@ static int parse_probe_vars(char *orig_arg, const struct fetch_type *t,
err = TP_ERR_BAD_ARG_NUM;
goto inval;
}
+ param--; /* argN starts from 1, but internal arg[N] starts from 0 */
- code->op = FETCH_OP_ARG;
- code->param = (unsigned int)param - 1;
- /*
- * The tracepoint probe will probe a stub function, and the
- * first parameter of the stub is a dummy and should be ignored.
- */
- if (ctx->flags & TPARG_FL_TPOINT)
- code->param++;
+ if (tparg_is_function_entry(ctx->flags)) {
+ code->op = FETCH_OP_ARG;
+ code->param = (unsigned int)param;
+ /*
+ * The tracepoint probe will probe a stub function, and the
+ * first parameter of the stub is a dummy and should be ignored.
+ */
+ if (ctx->flags & TPARG_FL_TPOINT)
+ code->param++;
+ } else if (tparg_is_function_return(ctx->flags)) {
+ /* function entry argument access from return probe */
+ ret = __store_entry_arg(ctx->tp, param);
+ if (ret < 0) /* This error should be an internal error */
+ return ret;
+
+ code->op = FETCH_OP_EDATA;
+ code->offset = ret;
+ } else {
+ err = TP_ERR_NOFENTRY_ARGS;
+ goto inval;
+ }
return 0;
}
#endif
@@ -1037,7 +1163,8 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
break;
default:
if (isalpha(arg[0]) || arg[0] == '_') { /* BTF variable */
- if (!tparg_is_function_entry(ctx->flags)) {
+ if (!tparg_is_function_entry(ctx->flags) &&
+ !tparg_is_function_return(ctx->flags)) {
trace_probe_log_err(ctx->offset, NOSUP_BTFARG);
return -EINVAL;
}
@@ -1090,67 +1217,45 @@ static int __parse_bitfield_probe_arg(const char *bf,
return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
}
-/* String length checking wrapper */
-static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
- struct probe_arg *parg,
- struct traceprobe_parse_context *ctx)
+/* Split type part from @arg and return it. */
+static char *parse_probe_arg_type(char *arg, struct probe_arg *parg,
+ struct traceprobe_parse_context *ctx)
{
- struct fetch_insn *code, *scode, *tmp = NULL;
- char *t, *t2, *t3;
- int ret, len;
- char *arg;
-
- arg = kstrdup(argv, GFP_KERNEL);
- if (!arg)
- return -ENOMEM;
-
- ret = -EINVAL;
- len = strlen(arg);
- if (len > MAX_ARGSTR_LEN) {
- trace_probe_log_err(ctx->offset, ARG_TOO_LONG);
- goto out;
- } else if (len == 0) {
- trace_probe_log_err(ctx->offset, NO_ARG_BODY);
- goto out;
- }
-
- ret = -ENOMEM;
- parg->comm = kstrdup(arg, GFP_KERNEL);
- if (!parg->comm)
- goto out;
+ char *t = NULL, *t2, *t3;
+ int offs;
- ret = -EINVAL;
t = strchr(arg, ':');
if (t) {
- *t = '\0';
- t2 = strchr(++t, '[');
+ *t++ = '\0';
+ t2 = strchr(t, '[');
if (t2) {
*t2++ = '\0';
t3 = strchr(t2, ']');
if (!t3) {
- int offs = t2 + strlen(t2) - arg;
+ offs = t2 + strlen(t2) - arg;
trace_probe_log_err(ctx->offset + offs,
ARRAY_NO_CLOSE);
- goto out;
+ return ERR_PTR(-EINVAL);
} else if (t3[1] != '\0') {
trace_probe_log_err(ctx->offset + t3 + 1 - arg,
BAD_ARRAY_SUFFIX);
- goto out;
+ return ERR_PTR(-EINVAL);
}
*t3 = '\0';
if (kstrtouint(t2, 0, &parg->count) || !parg->count) {
trace_probe_log_err(ctx->offset + t2 - arg,
BAD_ARRAY_NUM);
- goto out;
+ return ERR_PTR(-EINVAL);
}
if (parg->count > MAX_ARRAY_LEN) {
trace_probe_log_err(ctx->offset + t2 - arg,
ARRAY_TOO_BIG);
- goto out;
+ return ERR_PTR(-EINVAL);
}
}
}
+ offs = t ? t - arg : 0;
/*
* Since $comm and immediate string can not be dereferenced,
@@ -1161,74 +1266,52 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
strncmp(arg, "\\\"", 2) == 0)) {
/* The type of $comm must be "string", and not an array type. */
if (parg->count || (t && strcmp(t, "string"))) {
- trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0),
- NEED_STRING_TYPE);
- goto out;
+ trace_probe_log_err(ctx->offset + offs, NEED_STRING_TYPE);
+ return ERR_PTR(-EINVAL);
}
parg->type = find_fetch_type("string", ctx->flags);
} else
parg->type = find_fetch_type(t, ctx->flags);
+
if (!parg->type) {
- trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), BAD_TYPE);
- goto out;
+ trace_probe_log_err(ctx->offset + offs, BAD_TYPE);
+ return ERR_PTR(-EINVAL);
}
- code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL);
- if (!code)
- goto out;
- code[FETCH_INSN_MAX - 1].op = FETCH_OP_END;
-
- ctx->last_type = NULL;
- ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1],
- ctx);
- if (ret)
- goto fail;
-
- /* Update storing type if BTF is available */
- if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) &&
- ctx->last_type) {
- if (!t) {
- parg->type = find_fetch_type_from_btf_type(ctx);
- } else if (strstr(t, "string")) {
- ret = check_prepare_btf_string_fetch(t, &code, ctx);
- if (ret)
- goto fail;
- }
- }
- parg->offset = *size;
- *size += parg->type->size * (parg->count ?: 1);
+ return t;
+}
- if (parg->count) {
- len = strlen(parg->type->fmttype) + 6;
- parg->fmt = kmalloc(len, GFP_KERNEL);
- if (!parg->fmt) {
- ret = -ENOMEM;
- goto out;
- }
- snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype,
- parg->count);
- }
+/* After parsing, adjust the fetch_insn according to the probe_arg */
+static int finalize_fetch_insn(struct fetch_insn *code,
+ struct probe_arg *parg,
+ char *type,
+ int type_offset,
+ struct traceprobe_parse_context *ctx)
+{
+ struct fetch_insn *scode;
+ int ret;
- ret = -EINVAL;
/* Store operation */
if (parg->type->is_string) {
+ /* Check bad combination of the type and the last fetch_insn. */
if (!strcmp(parg->type->name, "symstr")) {
if (code->op != FETCH_OP_REG && code->op != FETCH_OP_STACK &&
code->op != FETCH_OP_RETVAL && code->op != FETCH_OP_ARG &&
code->op != FETCH_OP_DEREF && code->op != FETCH_OP_TP_ARG) {
- trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0),
+ trace_probe_log_err(ctx->offset + type_offset,
BAD_SYMSTRING);
- goto fail;
+ return -EINVAL;
}
} else {
if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF &&
code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM &&
code->op != FETCH_OP_DATA && code->op != FETCH_OP_TP_ARG) {
- trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0),
+ trace_probe_log_err(ctx->offset + type_offset,
BAD_STRING);
- goto fail;
+ return -EINVAL;
}
}
+
if (!strcmp(parg->type->name, "symstr") ||
(code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM ||
code->op == FETCH_OP_DATA) || code->op == FETCH_OP_TP_ARG ||
@@ -1244,9 +1327,10 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
code++;
if (code->op != FETCH_OP_NOP) {
trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
- goto fail;
+ return -EINVAL;
}
}
+
/* If op == DEREF, replace it with STRING */
if (!strcmp(parg->type->name, "ustring") ||
code->op == FETCH_OP_UDEREF)
@@ -1267,47 +1351,134 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
code++;
if (code->op != FETCH_OP_NOP) {
trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
- goto fail;
+ return -E2BIG;
}
code->op = FETCH_OP_ST_RAW;
code->size = parg->type->size;
}
+
+ /* Save storing fetch_insn. */
scode = code;
+
/* Modify operation */
- if (t != NULL) {
- ret = __parse_bitfield_probe_arg(t, parg->type, &code);
+ if (type != NULL) {
+ /* Bitfield needs a special fetch_insn. */
+ ret = __parse_bitfield_probe_arg(type, parg->type, &code);
if (ret) {
- trace_probe_log_err(ctx->offset + t - arg, BAD_BITFIELD);
- goto fail;
+ trace_probe_log_err(ctx->offset + type_offset, BAD_BITFIELD);
+ return ret;
}
} else if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) &&
ctx->last_type) {
+ /* If user not specified the type, try parsing BTF bitfield. */
ret = parse_btf_bitfield(&code, ctx);
if (ret)
- goto fail;
+ return ret;
}
- ret = -EINVAL;
+
/* Loop(Array) operation */
if (parg->count) {
if (scode->op != FETCH_OP_ST_MEM &&
scode->op != FETCH_OP_ST_STRING &&
scode->op != FETCH_OP_ST_USTRING) {
- trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0),
- BAD_STRING);
- goto fail;
+ trace_probe_log_err(ctx->offset + type_offset, BAD_STRING);
+ return -EINVAL;
}
code++;
if (code->op != FETCH_OP_NOP) {
trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
- goto fail;
+ return -E2BIG;
}
code->op = FETCH_OP_LP_ARRAY;
code->param = parg->count;
}
+
+ /* Finalize the fetch_insn array. */
code++;
code->op = FETCH_OP_END;
- ret = 0;
+ return 0;
+}
+
+/* String length checking wrapper */
+static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
+ struct probe_arg *parg,
+ struct traceprobe_parse_context *ctx)
+{
+ struct fetch_insn *code, *tmp = NULL;
+ char *type, *arg;
+ int ret, len;
+
+ len = strlen(argv);
+ if (len > MAX_ARGSTR_LEN) {
+ trace_probe_log_err(ctx->offset, ARG_TOO_LONG);
+ return -E2BIG;
+ } else if (len == 0) {
+ trace_probe_log_err(ctx->offset, NO_ARG_BODY);
+ return -EINVAL;
+ }
+
+ arg = kstrdup(argv, GFP_KERNEL);
+ if (!arg)
+ return -ENOMEM;
+
+ parg->comm = kstrdup(arg, GFP_KERNEL);
+ if (!parg->comm) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ type = parse_probe_arg_type(arg, parg, ctx);
+ if (IS_ERR(type)) {
+ ret = PTR_ERR(type);
+ goto out;
+ }
+
+ code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL);
+ if (!code) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ code[FETCH_INSN_MAX - 1].op = FETCH_OP_END;
+
+ ctx->last_type = NULL;
+ ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1],
+ ctx);
+ if (ret < 0)
+ goto fail;
+
+ /* Update storing type if BTF is available */
+ if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) &&
+ ctx->last_type) {
+ if (!type) {
+ parg->type = find_fetch_type_from_btf_type(ctx);
+ } else if (strstr(type, "string")) {
+ ret = check_prepare_btf_string_fetch(type, &code, ctx);
+ if (ret)
+ goto fail;
+ }
+ }
+ parg->offset = *size;
+ *size += parg->type->size * (parg->count ?: 1);
+
+ if (parg->count) {
+ len = strlen(parg->type->fmttype) + 6;
+ parg->fmt = kmalloc(len, GFP_KERNEL);
+ if (!parg->fmt) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype,
+ parg->count);
+ }
+
+ ret = finalize_fetch_insn(code, parg, type, type ? type - arg : 0, ctx);
+ if (ret < 0)
+ goto fail;
+
+ for (; code < tmp + FETCH_INSN_MAX; code++)
+ if (code->op == FETCH_OP_END)
+ break;
/* Shrink down the code buffer */
parg->code = kcalloc(code - tmp + 1, sizeof(*code), GFP_KERNEL);
if (!parg->code)
@@ -1316,7 +1487,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
memcpy(parg->code, tmp, sizeof(*code) * (code - tmp + 1));
fail:
- if (ret) {
+ if (ret < 0) {
for (code = tmp; code < tmp + FETCH_INSN_MAX; code++)
if (code->op == FETCH_NOP_SYMBOL ||
code->op == FETCH_OP_DATA)
@@ -1379,9 +1550,7 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg,
struct probe_arg *parg = &tp->args[i];
const char *body;
- /* Increment count for freeing args in error case */
- tp->nr_args++;
-
+ ctx->tp = tp;
body = strchr(arg, '=');
if (body) {
if (body - arg > MAX_ARG_NAME_LEN) {
@@ -1438,7 +1607,8 @@ static int argv_has_var_arg(int argc, const char *argv[], int *args_idx,
if (str_has_prefix(argv[i], "$arg")) {
trace_probe_log_set_index(i + 2);
- if (!tparg_is_function_entry(ctx->flags)) {
+ if (!tparg_is_function_entry(ctx->flags) &&
+ !tparg_is_function_return(ctx->flags)) {
trace_probe_log_err(0, NOFENTRY_ARGS);
return -EINVAL;
}
@@ -1761,12 +1931,18 @@ void trace_probe_cleanup(struct trace_probe *tp)
for (i = 0; i < tp->nr_args; i++)
traceprobe_free_probe_arg(&tp->args[i]);
+ if (tp->entry_arg) {
+ kfree(tp->entry_arg->code);
+ kfree(tp->entry_arg);
+ tp->entry_arg = NULL;
+ }
+
if (tp->event)
trace_probe_unlink(tp);
}
int trace_probe_init(struct trace_probe *tp, const char *event,
- const char *group, bool alloc_filter)
+ const char *group, bool alloc_filter, int nargs)
{
struct trace_event_call *call;
size_t size = sizeof(struct trace_probe_event);
@@ -1802,6 +1978,11 @@ int trace_probe_init(struct trace_probe *tp, const char *event,
goto error;
}
+ tp->nr_args = nargs;
+ /* Make sure pointers in args[] are NULL */
+ if (nargs)
+ memset(tp->args, 0, sizeof(tp->args[0]) * nargs);
+
return 0;
error:
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index c1877d018269..cef3a50628a3 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -92,6 +92,7 @@ enum fetch_op {
FETCH_OP_ARG, /* Function argument : .param */
FETCH_OP_FOFFS, /* File offset: .immediate */
FETCH_OP_DATA, /* Allocated data: .data */
+ FETCH_OP_EDATA, /* Entry data: .offset */
// Stage 2 (dereference) op
FETCH_OP_DEREF, /* Dereference: .offset */
FETCH_OP_UDEREF, /* User-space Dereference: .offset */
@@ -102,6 +103,7 @@ enum fetch_op {
FETCH_OP_ST_STRING, /* String: .offset, .size */
FETCH_OP_ST_USTRING, /* User String: .offset, .size */
FETCH_OP_ST_SYMSTR, /* Kernel Symbol String: .offset, .size */
+ FETCH_OP_ST_EDATA, /* Store Entry Data: .offset */
// Stage 4 (modify) op
FETCH_OP_MOD_BF, /* Bitfield: .basesize, .lshift, .rshift */
// Stage 5 (loop) op
@@ -232,6 +234,11 @@ struct probe_arg {
const struct fetch_type *type; /* Type of this argument */
};
+struct probe_entry_arg {
+ struct fetch_insn *code;
+ unsigned int size; /* The entry data size */
+};
+
struct trace_uprobe_filter {
rwlock_t rwlock;
int nr_systemwide;
@@ -253,6 +260,7 @@ struct trace_probe {
struct trace_probe_event *event;
ssize_t size; /* trace entry size */
unsigned int nr_args;
+ struct probe_entry_arg *entry_arg; /* This is only for return probe */
struct probe_arg args[];
};
@@ -338,7 +346,7 @@ static inline bool trace_probe_has_single_file(struct trace_probe *tp)
}
int trace_probe_init(struct trace_probe *tp, const char *event,
- const char *group, bool alloc_filter);
+ const char *group, bool alloc_filter, int nargs);
void trace_probe_cleanup(struct trace_probe *tp);
int trace_probe_append(struct trace_probe *tp, struct trace_probe *to);
void trace_probe_unlink(struct trace_probe *tp);
@@ -355,6 +363,18 @@ int trace_probe_create(const char *raw_command, int (*createfn)(int, const char
int trace_probe_print_args(struct trace_seq *s, struct probe_arg *args, int nr_args,
u8 *data, void *field);
+#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+int traceprobe_get_entry_data_size(struct trace_probe *tp);
+/* This is a runtime function to store entry data */
+void store_trace_entry_data(void *edata, struct trace_probe *tp, struct pt_regs *regs);
+#else /* !CONFIG_HAVE_FUNCTION_ARG_ACCESS_API */
+static inline int traceprobe_get_entry_data_size(struct trace_probe *tp)
+{
+ return 0;
+}
+#define store_trace_entry_data(edata, tp, regs) do { } while (0)
+#endif
+
#define trace_probe_for_each_link(pos, tp) \
list_for_each_entry(pos, &(tp)->event->files, list)
#define trace_probe_for_each_link_rcu(pos, tp) \
@@ -381,6 +401,11 @@ static inline bool tparg_is_function_entry(unsigned int flags)
return (flags & TPARG_FL_LOC_MASK) == (TPARG_FL_KERNEL | TPARG_FL_FENTRY);
}
+static inline bool tparg_is_function_return(unsigned int flags)
+{
+ return (flags & TPARG_FL_LOC_MASK) == (TPARG_FL_KERNEL | TPARG_FL_RETURN);
+}
+
struct traceprobe_parse_context {
struct trace_event_call *event;
/* BTF related parameters */
@@ -392,6 +417,7 @@ struct traceprobe_parse_context {
const struct btf_type *last_type; /* Saved type */
u32 last_bitoffs; /* Saved bitoffs */
u32 last_bitsize; /* Saved bitsize */
+ struct trace_probe *tp;
unsigned int flags;
int offset;
};
@@ -506,7 +532,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(NO_BTFARG, "This variable is not found at this probe point"),\
C(NO_BTF_ENTRY, "No BTF entry for this probe point"), \
C(BAD_VAR_ARGS, "$arg* must be an independent parameter without name etc."),\
- C(NOFENTRY_ARGS, "$arg* can be used only on function entry"), \
+ C(NOFENTRY_ARGS, "$arg* can be used only on function entry or exit"), \
C(DOUBLE_ARGS, "$arg* can be used only once in the parameters"), \
C(ARGS_2LONG, "$arg* failed because the argument list is too long"), \
C(ARGIDX_2BIG, "$argN index is too big"), \
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index 3935b347f874..2caf0d2afb32 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -54,7 +54,7 @@ fetch_apply_bitfield(struct fetch_insn *code, void *buf)
* If dest is NULL, don't store result and return required dynamic data size.
*/
static int
-process_fetch_insn(struct fetch_insn *code, void *rec,
+process_fetch_insn(struct fetch_insn *code, void *rec, void *edata,
void *dest, void *base);
static nokprobe_inline int fetch_store_strlen(unsigned long addr);
static nokprobe_inline int
@@ -232,7 +232,7 @@ array:
/* Sum up total data length for dynamic arrays (strings) */
static nokprobe_inline int
-__get_data_size(struct trace_probe *tp, struct pt_regs *regs)
+__get_data_size(struct trace_probe *tp, struct pt_regs *regs, void *edata)
{
struct probe_arg *arg;
int i, len, ret = 0;
@@ -240,7 +240,7 @@ __get_data_size(struct trace_probe *tp, struct pt_regs *regs)
for (i = 0; i < tp->nr_args; i++) {
arg = tp->args + i;
if (unlikely(arg->dynamic)) {
- len = process_fetch_insn(arg->code, regs, NULL, NULL);
+ len = process_fetch_insn(arg->code, regs, edata, NULL, NULL);
if (len > 0)
ret += len;
}
@@ -251,7 +251,7 @@ __get_data_size(struct trace_probe *tp, struct pt_regs *regs)
/* Store the value of each argument */
static nokprobe_inline void
-store_trace_args(void *data, struct trace_probe *tp, void *rec,
+store_trace_args(void *data, struct trace_probe *tp, void *rec, void *edata,
int header_size, int maxlen)
{
struct probe_arg *arg;
@@ -266,7 +266,7 @@ store_trace_args(void *data, struct trace_probe *tp, void *rec,
/* Point the dynamic data area if needed */
if (unlikely(arg->dynamic))
*dl = make_data_loc(maxlen, dyndata - base);
- ret = process_fetch_insn(arg->code, rec, dl, base);
+ ret = process_fetch_insn(arg->code, rec, edata, dl, base);
if (arg->dynamic && likely(ret > 0)) {
dyndata += ret;
maxlen -= ret;
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index c9ffdcfe622e..8a407adb0e1c 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/uaccess.h>
+#include <linux/kmemleak.h>
#include <linux/ftrace.h>
#include <trace/events/sched.h>
@@ -148,3 +149,517 @@ void tracing_stop_tgid_record(void)
{
tracing_stop_sched_switch(RECORD_TGID);
}
+
+/*
+ * The tgid_map array maps from pid to tgid; i.e. the value stored at index i
+ * is the tgid last observed corresponding to pid=i.
+ */
+static int *tgid_map;
+
+/* The maximum valid index into tgid_map. */
+static size_t tgid_map_max;
+
+#define SAVED_CMDLINES_DEFAULT 128
+#define NO_CMDLINE_MAP UINT_MAX
+/*
+ * Preemption must be disabled before acquiring trace_cmdline_lock.
+ * The various trace_arrays' max_lock must be acquired in a context
+ * where interrupt is disabled.
+ */
+static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
+struct saved_cmdlines_buffer {
+ unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
+ unsigned *map_cmdline_to_pid;
+ unsigned cmdline_num;
+ int cmdline_idx;
+ char saved_cmdlines[];
+};
+static struct saved_cmdlines_buffer *savedcmd;
+
+/* Holds the size of a cmdline and pid element */
+#define SAVED_CMDLINE_MAP_ELEMENT_SIZE(s) \
+ (TASK_COMM_LEN + sizeof((s)->map_cmdline_to_pid[0]))
+
+static inline char *get_saved_cmdlines(int idx)
+{
+ return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN];
+}
+
+static inline void set_cmdline(int idx, const char *cmdline)
+{
+ strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
+}
+
+static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
+{
+ int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN);
+
+ kmemleak_free(s);
+ free_pages((unsigned long)s, order);
+}
+
+static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val)
+{
+ struct saved_cmdlines_buffer *s;
+ struct page *page;
+ int orig_size, size;
+ int order;
+
+ /* Figure out how much is needed to hold the given number of cmdlines */
+ orig_size = sizeof(*s) + val * SAVED_CMDLINE_MAP_ELEMENT_SIZE(s);
+ order = get_order(orig_size);
+ size = 1 << (order + PAGE_SHIFT);
+ page = alloc_pages(GFP_KERNEL, order);
+ if (!page)
+ return NULL;
+
+ s = page_address(page);
+ kmemleak_alloc(s, size, 1, GFP_KERNEL);
+ memset(s, 0, sizeof(*s));
+
+ /* Round up to actual allocation */
+ val = (size - sizeof(*s)) / SAVED_CMDLINE_MAP_ELEMENT_SIZE(s);
+ s->cmdline_num = val;
+
+ /* Place map_cmdline_to_pid array right after saved_cmdlines */
+ s->map_cmdline_to_pid = (unsigned *)&s->saved_cmdlines[val * TASK_COMM_LEN];
+
+ s->cmdline_idx = 0;
+ memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
+ sizeof(s->map_pid_to_cmdline));
+ memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
+ val * sizeof(*s->map_cmdline_to_pid));
+
+ return s;
+}
+
+int trace_create_savedcmd(void)
+{
+ savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT);
+
+ return savedcmd ? 0 : -ENOMEM;
+}
+
+int trace_save_cmdline(struct task_struct *tsk)
+{
+ unsigned tpid, idx;
+
+ /* treat recording of idle task as a success */
+ if (!tsk->pid)
+ return 1;
+
+ tpid = tsk->pid & (PID_MAX_DEFAULT - 1);
+
+ /*
+ * It's not the end of the world if we don't get
+ * the lock, but we also don't want to spin
+ * nor do we want to disable interrupts,
+ * so if we miss here, then better luck next time.
+ *
+ * This is called within the scheduler and wake up, so interrupts
+ * had better been disabled and run queue lock been held.
+ */
+ lockdep_assert_preemption_disabled();
+ if (!arch_spin_trylock(&trace_cmdline_lock))
+ return 0;
+
+ idx = savedcmd->map_pid_to_cmdline[tpid];
+ if (idx == NO_CMDLINE_MAP) {
+ idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num;
+
+ savedcmd->map_pid_to_cmdline[tpid] = idx;
+ savedcmd->cmdline_idx = idx;
+ }
+
+ savedcmd->map_cmdline_to_pid[idx] = tsk->pid;
+ set_cmdline(idx, tsk->comm);
+
+ arch_spin_unlock(&trace_cmdline_lock);
+
+ return 1;
+}
+
+static void __trace_find_cmdline(int pid, char comm[])
+{
+ unsigned map;
+ int tpid;
+
+ if (!pid) {
+ strcpy(comm, "<idle>");
+ return;
+ }
+
+ if (WARN_ON_ONCE(pid < 0)) {
+ strcpy(comm, "<XXX>");
+ return;
+ }
+
+ tpid = pid & (PID_MAX_DEFAULT - 1);
+ map = savedcmd->map_pid_to_cmdline[tpid];
+ if (map != NO_CMDLINE_MAP) {
+ tpid = savedcmd->map_cmdline_to_pid[map];
+ if (tpid == pid) {
+ strscpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN);
+ return;
+ }
+ }
+ strcpy(comm, "<...>");
+}
+
+void trace_find_cmdline(int pid, char comm[])
+{
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
+
+ __trace_find_cmdline(pid, comm);
+
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+}
+
+static int *trace_find_tgid_ptr(int pid)
+{
+ /*
+ * Pairs with the smp_store_release in set_tracer_flag() to ensure that
+ * if we observe a non-NULL tgid_map then we also observe the correct
+ * tgid_map_max.
+ */
+ int *map = smp_load_acquire(&tgid_map);
+
+ if (unlikely(!map || pid > tgid_map_max))
+ return NULL;
+
+ return &map[pid];
+}
+
+int trace_find_tgid(int pid)
+{
+ int *ptr = trace_find_tgid_ptr(pid);
+
+ return ptr ? *ptr : 0;
+}
+
+static int trace_save_tgid(struct task_struct *tsk)
+{
+ int *ptr;
+
+ /* treat recording of idle task as a success */
+ if (!tsk->pid)
+ return 1;
+
+ ptr = trace_find_tgid_ptr(tsk->pid);
+ if (!ptr)
+ return 0;
+
+ *ptr = tsk->tgid;
+ return 1;
+}
+
+static bool tracing_record_taskinfo_skip(int flags)
+{
+ if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID))))
+ return true;
+ if (!__this_cpu_read(trace_taskinfo_save))
+ return true;
+ return false;
+}
+
+/**
+ * tracing_record_taskinfo - record the task info of a task
+ *
+ * @task: task to record
+ * @flags: TRACE_RECORD_CMDLINE for recording comm
+ * TRACE_RECORD_TGID for recording tgid
+ */
+void tracing_record_taskinfo(struct task_struct *task, int flags)
+{
+ bool done;
+
+ if (tracing_record_taskinfo_skip(flags))
+ return;
+
+ /*
+ * Record as much task information as possible. If some fail, continue
+ * to try to record the others.
+ */
+ done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task);
+ done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task);
+
+ /* If recording any information failed, retry again soon. */
+ if (!done)
+ return;
+
+ __this_cpu_write(trace_taskinfo_save, false);
+}
+
+/**
+ * tracing_record_taskinfo_sched_switch - record task info for sched_switch
+ *
+ * @prev: previous task during sched_switch
+ * @next: next task during sched_switch
+ * @flags: TRACE_RECORD_CMDLINE for recording comm
+ * TRACE_RECORD_TGID for recording tgid
+ */
+void tracing_record_taskinfo_sched_switch(struct task_struct *prev,
+ struct task_struct *next, int flags)
+{
+ bool done;
+
+ if (tracing_record_taskinfo_skip(flags))
+ return;
+
+ /*
+ * Record as much task information as possible. If some fail, continue
+ * to try to record the others.
+ */
+ done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev);
+ done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next);
+ done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev);
+ done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next);
+
+ /* If recording any information failed, retry again soon. */
+ if (!done)
+ return;
+
+ __this_cpu_write(trace_taskinfo_save, false);
+}
+
+/* Helpers to record a specific task information */
+void tracing_record_cmdline(struct task_struct *task)
+{
+ tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE);
+}
+
+void tracing_record_tgid(struct task_struct *task)
+{
+ tracing_record_taskinfo(task, TRACE_RECORD_TGID);
+}
+
+int trace_alloc_tgid_map(void)
+{
+ int *map;
+
+ if (tgid_map)
+ return 0;
+
+ tgid_map_max = pid_max;
+ map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
+ GFP_KERNEL);
+ if (!map)
+ return -ENOMEM;
+
+ /*
+ * Pairs with smp_load_acquire() in
+ * trace_find_tgid_ptr() to ensure that if it observes
+ * the tgid_map we just allocated then it also observes
+ * the corresponding tgid_map_max value.
+ */
+ smp_store_release(&tgid_map, map);
+ return 0;
+}
+
+static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ int pid = ++(*pos);
+
+ return trace_find_tgid_ptr(pid);
+}
+
+static void *saved_tgids_start(struct seq_file *m, loff_t *pos)
+{
+ int pid = *pos;
+
+ return trace_find_tgid_ptr(pid);
+}
+
+static void saved_tgids_stop(struct seq_file *m, void *v)
+{
+}
+
+static int saved_tgids_show(struct seq_file *m, void *v)
+{
+ int *entry = (int *)v;
+ int pid = entry - tgid_map;
+ int tgid = *entry;
+
+ if (tgid == 0)
+ return SEQ_SKIP;
+
+ seq_printf(m, "%d %d\n", pid, tgid);
+ return 0;
+}
+
+static const struct seq_operations tracing_saved_tgids_seq_ops = {
+ .start = saved_tgids_start,
+ .stop = saved_tgids_stop,
+ .next = saved_tgids_next,
+ .show = saved_tgids_show,
+};
+
+static int tracing_saved_tgids_open(struct inode *inode, struct file *filp)
+{
+ int ret;
+
+ ret = tracing_check_open_get_tr(NULL);
+ if (ret)
+ return ret;
+
+ return seq_open(filp, &tracing_saved_tgids_seq_ops);
+}
+
+
+const struct file_operations tracing_saved_tgids_fops = {
+ .open = tracing_saved_tgids_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ unsigned int *ptr = v;
+
+ if (*pos || m->count)
+ ptr++;
+
+ (*pos)++;
+
+ for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num];
+ ptr++) {
+ if (*ptr == -1 || *ptr == NO_CMDLINE_MAP)
+ continue;
+
+ return ptr;
+ }
+
+ return NULL;
+}
+
+static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos)
+{
+ void *v;
+ loff_t l = 0;
+
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
+
+ v = &savedcmd->map_cmdline_to_pid[0];
+ while (l <= *pos) {
+ v = saved_cmdlines_next(m, v, &l);
+ if (!v)
+ return NULL;
+ }
+
+ return v;
+}
+
+static void saved_cmdlines_stop(struct seq_file *m, void *v)
+{
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+}
+
+static int saved_cmdlines_show(struct seq_file *m, void *v)
+{
+ char buf[TASK_COMM_LEN];
+ unsigned int *pid = v;
+
+ __trace_find_cmdline(*pid, buf);
+ seq_printf(m, "%d %s\n", *pid, buf);
+ return 0;
+}
+
+static const struct seq_operations tracing_saved_cmdlines_seq_ops = {
+ .start = saved_cmdlines_start,
+ .next = saved_cmdlines_next,
+ .stop = saved_cmdlines_stop,
+ .show = saved_cmdlines_show,
+};
+
+static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp)
+{
+ int ret;
+
+ ret = tracing_check_open_get_tr(NULL);
+ if (ret)
+ return ret;
+
+ return seq_open(filp, &tracing_saved_cmdlines_seq_ops);
+}
+
+const struct file_operations tracing_saved_cmdlines_fops = {
+ .open = tracing_saved_cmdlines_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static ssize_t
+tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[64];
+ int r;
+
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
+ r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num);
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+void trace_free_saved_cmdlines_buffer(void)
+{
+ free_saved_cmdlines_buffer(savedcmd);
+}
+
+static int tracing_resize_saved_cmdlines(unsigned int val)
+{
+ struct saved_cmdlines_buffer *s, *savedcmd_temp;
+
+ s = allocate_cmdlines_buffer(val);
+ if (!s)
+ return -ENOMEM;
+
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
+ savedcmd_temp = savedcmd;
+ savedcmd = s;
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+ free_saved_cmdlines_buffer(savedcmd_temp);
+
+ return 0;
+}
+
+static ssize_t
+tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ /* must have at least 1 entry or less than PID_MAX_DEFAULT */
+ if (!val || val > PID_MAX_DEFAULT)
+ return -EINVAL;
+
+ ret = tracing_resize_saved_cmdlines((unsigned int)val);
+ if (ret < 0)
+ return ret;
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+const struct file_operations tracing_saved_cmdlines_size_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_saved_cmdlines_size_read,
+ .write = tracing_saved_cmdlines_size_write,
+};
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 529590499b1f..e9c5058a8efd 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -768,7 +768,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {
ftrace_graph_stop();
printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
- if (ftrace_dump_on_oops) {
+ if (ftrace_dump_on_oops_enabled()) {
ftrace_dump(DUMP_ALL);
/* ftrace_dump() disables tracing */
tracing_on();
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index a84b85d8aac1..9e461362450a 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -211,8 +211,8 @@ static unsigned long translate_user_vaddr(unsigned long file_offset)
/* Note that we don't verify it, since the code does not come from user space */
static int
-process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
- void *base)
+process_fetch_insn(struct fetch_insn *code, void *rec, void *edata,
+ void *dest, void *base)
{
struct pt_regs *regs = rec;
unsigned long val;
@@ -337,7 +337,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
if (!tu)
return ERR_PTR(-ENOMEM);
- ret = trace_probe_init(&tu->tp, event, group, true);
+ ret = trace_probe_init(&tu->tp, event, group, true, nargs);
if (ret < 0)
goto error;
@@ -1490,11 +1490,11 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
if (WARN_ON_ONCE(!uprobe_cpu_buffer))
return 0;
- dsize = __get_data_size(&tu->tp, regs);
+ dsize = __get_data_size(&tu->tp, regs, NULL);
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
ucb = uprobe_buffer_get();
- store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize);
+ store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize);
if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE))
ret |= uprobe_trace_func(tu, regs, ucb, dsize);
@@ -1525,11 +1525,11 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con,
if (WARN_ON_ONCE(!uprobe_cpu_buffer))
return 0;
- dsize = __get_data_size(&tu->tp, regs);
+ dsize = __get_data_size(&tu->tp, regs, NULL);
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
ucb = uprobe_buffer_get();
- store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize);
+ store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize);
if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE))
uretprobe_trace_func(tu, func, regs, ucb, dsize);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index ce4d99df5f0e..0b0b95418b16 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -931,7 +931,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
struct uid_gid_map new_map;
unsigned idx;
struct uid_gid_extent extent;
- char *kbuf = NULL, *pos, *next_line;
+ char *kbuf, *pos, *next_line;
ssize_t ret;
/* Only allow < page size writes at the beginning of the file */
diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c
new file mode 100644
index 000000000000..f95516cd45bb
--- /dev/null
+++ b/kernel/vmcore_info.c
@@ -0,0 +1,233 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * crash.c - kernel crash support code.
+ * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
+ */
+
+#include <linux/buildid.h>
+#include <linux/init.h>
+#include <linux/utsname.h>
+#include <linux/vmalloc.h>
+#include <linux/sizes.h>
+#include <linux/kexec.h>
+#include <linux/memory.h>
+#include <linux/cpuhotplug.h>
+#include <linux/memblock.h>
+#include <linux/kmemleak.h>
+
+#include <asm/page.h>
+#include <asm/sections.h>
+
+#include <crypto/sha1.h>
+
+#include "kallsyms_internal.h"
+#include "kexec_internal.h"
+
+/* vmcoreinfo stuff */
+unsigned char *vmcoreinfo_data;
+size_t vmcoreinfo_size;
+u32 *vmcoreinfo_note;
+
+/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
+static unsigned char *vmcoreinfo_data_safecopy;
+
+Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
+ void *data, size_t data_len)
+{
+ struct elf_note *note = (struct elf_note *)buf;
+
+ note->n_namesz = strlen(name) + 1;
+ note->n_descsz = data_len;
+ note->n_type = type;
+ buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word));
+ memcpy(buf, name, note->n_namesz);
+ buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word));
+ memcpy(buf, data, data_len);
+ buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word));
+
+ return buf;
+}
+
+void final_note(Elf_Word *buf)
+{
+ memset(buf, 0, sizeof(struct elf_note));
+}
+
+static void update_vmcoreinfo_note(void)
+{
+ u32 *buf = vmcoreinfo_note;
+
+ if (!vmcoreinfo_size)
+ return;
+ buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
+ vmcoreinfo_size);
+ final_note(buf);
+}
+
+void crash_update_vmcoreinfo_safecopy(void *ptr)
+{
+ if (ptr)
+ memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size);
+
+ vmcoreinfo_data_safecopy = ptr;
+}
+
+void crash_save_vmcoreinfo(void)
+{
+ if (!vmcoreinfo_note)
+ return;
+
+ /* Use the safe copy to generate vmcoreinfo note if have */
+ if (vmcoreinfo_data_safecopy)
+ vmcoreinfo_data = vmcoreinfo_data_safecopy;
+
+ vmcoreinfo_append_str("CRASHTIME=%lld\n", ktime_get_real_seconds());
+ update_vmcoreinfo_note();
+}
+
+void vmcoreinfo_append_str(const char *fmt, ...)
+{
+ va_list args;
+ char buf[0x50];
+ size_t r;
+
+ va_start(args, fmt);
+ r = vscnprintf(buf, sizeof(buf), fmt, args);
+ va_end(args);
+
+ r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size);
+
+ memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
+
+ vmcoreinfo_size += r;
+
+ WARN_ONCE(vmcoreinfo_size == VMCOREINFO_BYTES,
+ "vmcoreinfo data exceeds allocated size, truncating");
+}
+
+/*
+ * provide an empty default implementation here -- architecture
+ * code may override this
+ */
+void __weak arch_crash_save_vmcoreinfo(void)
+{}
+
+phys_addr_t __weak paddr_vmcoreinfo_note(void)
+{
+ return __pa(vmcoreinfo_note);
+}
+EXPORT_SYMBOL(paddr_vmcoreinfo_note);
+
+static int __init crash_save_vmcoreinfo_init(void)
+{
+ vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
+ if (!vmcoreinfo_data) {
+ pr_warn("Memory allocation for vmcoreinfo_data failed\n");
+ return -ENOMEM;
+ }
+
+ vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE,
+ GFP_KERNEL | __GFP_ZERO);
+ if (!vmcoreinfo_note) {
+ free_page((unsigned long)vmcoreinfo_data);
+ vmcoreinfo_data = NULL;
+ pr_warn("Memory allocation for vmcoreinfo_note failed\n");
+ return -ENOMEM;
+ }
+
+ VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
+ VMCOREINFO_BUILD_ID();
+ VMCOREINFO_PAGESIZE(PAGE_SIZE);
+
+ VMCOREINFO_SYMBOL(init_uts_ns);
+ VMCOREINFO_OFFSET(uts_namespace, name);
+ VMCOREINFO_SYMBOL(node_online_map);
+#ifdef CONFIG_MMU
+ VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir);
+#endif
+ VMCOREINFO_SYMBOL(_stext);
+ vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", (unsigned long) VMALLOC_START);
+
+#ifndef CONFIG_NUMA
+ VMCOREINFO_SYMBOL(mem_map);
+ VMCOREINFO_SYMBOL(contig_page_data);
+#endif
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ VMCOREINFO_SYMBOL_ARRAY(vmemmap);
+#endif
+#ifdef CONFIG_SPARSEMEM
+ VMCOREINFO_SYMBOL_ARRAY(mem_section);
+ VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
+ VMCOREINFO_STRUCT_SIZE(mem_section);
+ VMCOREINFO_OFFSET(mem_section, section_mem_map);
+ VMCOREINFO_NUMBER(SECTION_SIZE_BITS);
+ VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS);
+#endif
+ VMCOREINFO_STRUCT_SIZE(page);
+ VMCOREINFO_STRUCT_SIZE(pglist_data);
+ VMCOREINFO_STRUCT_SIZE(zone);
+ VMCOREINFO_STRUCT_SIZE(free_area);
+ VMCOREINFO_STRUCT_SIZE(list_head);
+ VMCOREINFO_SIZE(nodemask_t);
+ VMCOREINFO_OFFSET(page, flags);
+ VMCOREINFO_OFFSET(page, _refcount);
+ VMCOREINFO_OFFSET(page, mapping);
+ VMCOREINFO_OFFSET(page, lru);
+ VMCOREINFO_OFFSET(page, _mapcount);
+ VMCOREINFO_OFFSET(page, private);
+ VMCOREINFO_OFFSET(page, compound_head);
+ VMCOREINFO_OFFSET(pglist_data, node_zones);
+ VMCOREINFO_OFFSET(pglist_data, nr_zones);
+#ifdef CONFIG_FLATMEM
+ VMCOREINFO_OFFSET(pglist_data, node_mem_map);
+#endif
+ VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
+ VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
+ VMCOREINFO_OFFSET(pglist_data, node_id);
+ VMCOREINFO_OFFSET(zone, free_area);
+ VMCOREINFO_OFFSET(zone, vm_stat);
+ VMCOREINFO_OFFSET(zone, spanned_pages);
+ VMCOREINFO_OFFSET(free_area, free_list);
+ VMCOREINFO_OFFSET(list_head, next);
+ VMCOREINFO_OFFSET(list_head, prev);
+ VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS);
+ log_buf_vmcoreinfo_setup();
+ VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
+ VMCOREINFO_NUMBER(NR_FREE_PAGES);
+ VMCOREINFO_NUMBER(PG_lru);
+ VMCOREINFO_NUMBER(PG_private);
+ VMCOREINFO_NUMBER(PG_swapcache);
+ VMCOREINFO_NUMBER(PG_swapbacked);
+ VMCOREINFO_NUMBER(PG_slab);
+#ifdef CONFIG_MEMORY_FAILURE
+ VMCOREINFO_NUMBER(PG_hwpoison);
+#endif
+ VMCOREINFO_NUMBER(PG_head_mask);
+#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy)
+ VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
+#ifdef CONFIG_HUGETLB_PAGE
+ VMCOREINFO_NUMBER(PG_hugetlb);
+#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline)
+ VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
+#endif
+
+#ifdef CONFIG_KALLSYMS
+ VMCOREINFO_SYMBOL(kallsyms_names);
+ VMCOREINFO_SYMBOL(kallsyms_num_syms);
+ VMCOREINFO_SYMBOL(kallsyms_token_table);
+ VMCOREINFO_SYMBOL(kallsyms_token_index);
+#ifdef CONFIG_KALLSYMS_BASE_RELATIVE
+ VMCOREINFO_SYMBOL(kallsyms_offsets);
+ VMCOREINFO_SYMBOL(kallsyms_relative_base);
+#else
+ VMCOREINFO_SYMBOL(kallsyms_addresses);
+#endif /* CONFIG_KALLSYMS_BASE_RELATIVE */
+#endif /* CONFIG_KALLSYMS */
+
+ arch_crash_save_vmcoreinfo();
+ update_vmcoreinfo_note();
+
+ return 0;
+}
+
+subsys_initcall(crash_save_vmcoreinfo_init);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 81a8862295d6..d7b2125503af 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -796,8 +796,8 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
/*
* /proc/sys/kernel/watchdog
*/
-int proc_watchdog(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+static int proc_watchdog(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
return proc_watchdog_common(WATCHDOG_HARDLOCKUP_ENABLED |
WATCHDOG_SOFTOCKUP_ENABLED,
@@ -807,8 +807,8 @@ int proc_watchdog(struct ctl_table *table, int write,
/*
* /proc/sys/kernel/nmi_watchdog
*/
-int proc_nmi_watchdog(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+static int proc_nmi_watchdog(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
if (!watchdog_hardlockup_available && write)
return -ENOTSUPP;
@@ -816,21 +816,23 @@ int proc_nmi_watchdog(struct ctl_table *table, int write,
table, write, buffer, lenp, ppos);
}
+#ifdef CONFIG_SOFTLOCKUP_DETECTOR
/*
* /proc/sys/kernel/soft_watchdog
*/
-int proc_soft_watchdog(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+static int proc_soft_watchdog(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
return proc_watchdog_common(WATCHDOG_SOFTOCKUP_ENABLED,
table, write, buffer, lenp, ppos);
}
+#endif
/*
* /proc/sys/kernel/watchdog_thresh
*/
-int proc_watchdog_thresh(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+static int proc_watchdog_thresh(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int err, old;
@@ -852,8 +854,8 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
* user to specify a mask that will include cpus that have not yet
* been brought online, if desired.
*/
-int proc_watchdog_cpumask(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+static int proc_watchdog_cpumask(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int err;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8fbb0ec39079..f397510edc9b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -7080,7 +7080,7 @@ static struct device_attribute wq_sysfs_unbound_attrs[] = {
__ATTR_NULL,
};
-static struct bus_type wq_subsys = {
+static const struct bus_type wq_subsys = {
.name = "workqueue",
.dev_groups = wq_sysfs_groups,
};