diff options
Diffstat (limited to 'kernel')
125 files changed, 2957 insertions, 1734 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 6c34e63c88ff..4d111f871951 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -97,7 +97,7 @@ config KEXEC_JUMP config CRASH_DUMP bool "kernel crash dumps" - default y + default ARCH_DEFAULT_CRASH_DUMP depends on ARCH_SUPPORTS_CRASH_DUMP depends on KEXEC_CORE select VMCORE_INFO diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index fe782cd77388..54ea59ff8fbe 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -11,12 +11,16 @@ config PREEMPT_BUILD select PREEMPTION select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK +config ARCH_HAS_PREEMPT_LAZY + bool + choice prompt "Preemption Model" default PREEMPT_NONE config PREEMPT_NONE bool "No Forced Preemption (Server)" + depends on !PREEMPT_RT select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC help This is the traditional Linux preemption model, geared towards @@ -32,6 +36,7 @@ config PREEMPT_NONE config PREEMPT_VOLUNTARY bool "Voluntary Kernel Preemption (Desktop)" depends on !ARCH_NO_PREEMPT + depends on !PREEMPT_RT select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC help This option reduces the latency of the kernel by adding more @@ -51,7 +56,7 @@ config PREEMPT_VOLUNTARY config PREEMPT bool "Preemptible Kernel (Low-Latency Desktop)" depends on !ARCH_NO_PREEMPT - select PREEMPT_BUILD + select PREEMPT_BUILD if !PREEMPT_DYNAMIC help This option reduces the latency of the kernel by making all kernel code (that is not executing in a critical section) @@ -67,9 +72,23 @@ config PREEMPT embedded system with latency requirements in the milliseconds range. 
+config PREEMPT_LAZY + bool "Scheduler controlled preemption model" + depends on !ARCH_NO_PREEMPT + depends on ARCH_HAS_PREEMPT_LAZY + select PREEMPT_BUILD if !PREEMPT_DYNAMIC + help + This option provides a scheduler driven preemption model that + is fundamentally similar to full preemption, but is less + eager to preempt SCHED_NORMAL tasks in an attempt to + reduce lock holder preemption and recover some of the performance + gains seen from using Voluntary preemption. + +endchoice + config PREEMPT_RT bool "Fully Preemptible Kernel (Real-Time)" - depends on EXPERT && ARCH_SUPPORTS_RT + depends on EXPERT && ARCH_SUPPORTS_RT && !COMPILE_TEST select PREEMPTION help This option turns the kernel into a real-time kernel by replacing @@ -84,8 +103,6 @@ config PREEMPT_RT Select this if you are building a kernel for systems which require real-time guarantees. -endchoice - config PREEMPT_COUNT bool @@ -95,7 +112,7 @@ config PREEMPTION config PREEMPT_DYNAMIC bool "Preemption behaviour defined on boot" - depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT + depends on HAVE_PREEMPT_DYNAMIC select JUMP_LABEL if HAVE_PREEMPT_DYNAMIC_KEY select PREEMPT_BUILD default y if HAVE_PREEMPT_DYNAMIC_CALL diff --git a/kernel/audit.c b/kernel/audit.c index 1edaa4846a47..6a95a6077953 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -123,7 +123,7 @@ static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; /* The identity of the user shutting down the audit system. 
*/ static kuid_t audit_sig_uid = INVALID_UID; static pid_t audit_sig_pid = -1; -static u32 audit_sig_sid; +static struct lsm_prop audit_sig_lsm; /* Records can be lost in several ways: 0) [suppressed in audit_alloc] @@ -1473,20 +1473,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, } case AUDIT_SIGNAL_INFO: len = 0; - if (audit_sig_sid) { - err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); + if (lsmprop_is_set(&audit_sig_lsm)) { + err = security_lsmprop_to_secctx(&audit_sig_lsm, &ctx, + &len); if (err) return err; } sig_data = kmalloc(struct_size(sig_data, ctx, len), GFP_KERNEL); if (!sig_data) { - if (audit_sig_sid) + if (lsmprop_is_set(&audit_sig_lsm)) security_release_secctx(ctx, len); return -ENOMEM; } sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid); sig_data->pid = audit_sig_pid; - if (audit_sig_sid) { + if (lsmprop_is_set(&audit_sig_lsm)) { memcpy(sig_data->ctx, ctx, len); security_release_secctx(ctx, len); } @@ -2102,8 +2103,8 @@ bool audit_string_contains_control(const char *string, size_t len) /** * audit_log_n_untrustedstring - log a string that may contain random characters * @ab: audit_buffer - * @len: length of string (not including trailing null) * @string: string to be logged + * @len: length of string (not including trailing null) * * This code will escape a string that is passed to it if the string * contains a control character, unprintable character, double quote mark, @@ -2178,16 +2179,16 @@ void audit_log_key(struct audit_buffer *ab, char *key) int audit_log_task_context(struct audit_buffer *ab) { + struct lsm_prop prop; char *ctx = NULL; unsigned len; int error; - u32 sid; - security_current_getsecid_subj(&sid); - if (!sid) + security_current_getlsmprop_subj(&prop); + if (!lsmprop_is_set(&prop)) return 0; - error = security_secid_to_secctx(sid, &ctx, &len); + error = security_lsmprop_to_secctx(&prop, &ctx, &len); if (error) { if (error != -EINVAL) goto error_path; @@ -2404,7 +2405,7 @@ int 
audit_signal_info(int sig, struct task_struct *t) audit_sig_uid = auid; else audit_sig_uid = uid; - security_current_getsecid_subj(&audit_sig_sid); + security_current_getlsmprop_subj(&audit_sig_lsm); } return audit_signal_info_syscall(t); diff --git a/kernel/audit.h b/kernel/audit.h index a60d2840559e..0211cb307d30 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -11,6 +11,7 @@ #include <linux/fs.h> #include <linux/audit.h> +#include <linux/security.h> #include <linux/skbuff.h> #include <uapi/linux/mqueue.h> #include <linux/tty.h> @@ -81,7 +82,7 @@ struct audit_names { kuid_t uid; kgid_t gid; dev_t rdev; - u32 osid; + struct lsm_prop oprop; struct audit_cap_data fcap; unsigned int fcap_ver; unsigned char type; /* record type */ @@ -143,7 +144,7 @@ struct audit_context { kuid_t target_auid; kuid_t target_uid; unsigned int target_sessionid; - u32 target_sid; + struct lsm_prop target_ref; char target_comm[TASK_COMM_LEN]; struct audit_tree_refs *trees, *first_trees; @@ -160,7 +161,7 @@ struct audit_context { kuid_t uid; kgid_t gid; umode_t mode; - u32 osid; + struct lsm_prop oprop; int has_perm; uid_t perm_uid; gid_t perm_gid; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 470041c49a44..bceb9f58a09e 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1339,8 +1339,8 @@ int audit_filter(int msgtype, unsigned int listtype) for (i = 0; i < e->rule.field_count; i++) { struct audit_field *f = &e->rule.fields[i]; + struct lsm_prop prop = { }; pid_t pid; - u32 sid; switch (f->type) { case AUDIT_PID: @@ -1370,9 +1370,10 @@ int audit_filter(int msgtype, unsigned int listtype) case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: if (f->lsm_rule) { - security_current_getsecid_subj(&sid); - result = security_audit_rule_match(sid, - f->type, f->op, f->lsm_rule); + security_current_getlsmprop_subj(&prop); + result = security_audit_rule_match( + &prop, f->type, f->op, + f->lsm_rule); } break; case AUDIT_EXE: diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 
cd57053b4a69..91afdd0d036e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -100,7 +100,7 @@ struct audit_aux_data_pids { kuid_t target_auid[AUDIT_AUX_PIDS]; kuid_t target_uid[AUDIT_AUX_PIDS]; unsigned int target_sessionid[AUDIT_AUX_PIDS]; - u32 target_sid[AUDIT_AUX_PIDS]; + struct lsm_prop target_ref[AUDIT_AUX_PIDS]; char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN]; int pid_count; }; @@ -470,7 +470,7 @@ static int audit_filter_rules(struct task_struct *tsk, { const struct cred *cred; int i, need_sid = 1; - u32 sid; + struct lsm_prop prop = { }; unsigned int sessionid; if (ctx && rule->prio <= ctx->prio) @@ -674,14 +674,16 @@ static int audit_filter_rules(struct task_struct *tsk, * fork()/copy_process() in which case * the new @tsk creds are still a dup * of @current's creds so we can still - * use security_current_getsecid_subj() + * use + * security_current_getlsmprop_subj() * here even though it always refs * @current's creds */ - security_current_getsecid_subj(&sid); + security_current_getlsmprop_subj(&prop); need_sid = 0; } - result = security_audit_rule_match(sid, f->type, + result = security_audit_rule_match(&prop, + f->type, f->op, f->lsm_rule); } @@ -697,14 +699,14 @@ static int audit_filter_rules(struct task_struct *tsk, /* Find files that match */ if (name) { result = security_audit_rule_match( - name->osid, + &name->oprop, f->type, f->op, f->lsm_rule); } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { if (security_audit_rule_match( - n->osid, + &n->oprop, f->type, f->op, f->lsm_rule)) { @@ -716,7 +718,7 @@ static int audit_filter_rules(struct task_struct *tsk, /* Find ipc objects that match */ if (!ctx || ctx->type != AUDIT_IPC) break; - if (security_audit_rule_match(ctx->ipc.osid, + if (security_audit_rule_match(&ctx->ipc.oprop, f->type, f->op, f->lsm_rule)) ++result; @@ -1017,7 +1019,7 @@ static void audit_reset_context(struct audit_context *ctx) ctx->target_pid = 0; ctx->target_auid = ctx->target_uid = KUIDT_INIT(0); 
ctx->target_sessionid = 0; - ctx->target_sid = 0; + lsmprop_init(&ctx->target_ref); ctx->target_comm[0] = '\0'; unroll_tree_refs(ctx, NULL, 0); WARN_ON(!list_empty(&ctx->killed_trees)); @@ -1091,8 +1093,9 @@ static inline void audit_free_context(struct audit_context *context) } static int audit_log_pid_context(struct audit_context *context, pid_t pid, - kuid_t auid, kuid_t uid, unsigned int sessionid, - u32 sid, char *comm) + kuid_t auid, kuid_t uid, + unsigned int sessionid, struct lsm_prop *prop, + char *comm) { struct audit_buffer *ab; char *ctx = NULL; @@ -1106,8 +1109,8 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, from_kuid(&init_user_ns, auid), from_kuid(&init_user_ns, uid), sessionid); - if (sid) { - if (security_secid_to_secctx(sid, &ctx, &len)) { + if (lsmprop_is_set(prop)) { + if (security_lsmprop_to_secctx(prop, &ctx, &len)) { audit_log_format(ab, " obj=(none)"); rc = 1; } else { @@ -1384,19 +1387,17 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_format(ab, " a%d=%lx", i, context->socketcall.args[i]); break; } - case AUDIT_IPC: { - u32 osid = context->ipc.osid; - + case AUDIT_IPC: audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho", from_kuid(&init_user_ns, context->ipc.uid), from_kgid(&init_user_ns, context->ipc.gid), context->ipc.mode); - if (osid) { + if (lsmprop_is_set(&context->ipc.oprop)) { char *ctx = NULL; u32 len; - if (security_secid_to_secctx(osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", osid); + if (security_lsmprop_to_secctx(&context->ipc.oprop, + &ctx, &len)) { *call_panic = 1; } else { audit_log_format(ab, " obj=%s", ctx); @@ -1416,7 +1417,7 @@ static void show_special(struct audit_context *context, int *call_panic) context->ipc.perm_gid, context->ipc.perm_mode); } - break; } + break; case AUDIT_MQ_OPEN: audit_log_format(ab, "oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld " @@ -1558,13 +1559,11 @@ 
static void audit_log_name(struct audit_context *context, struct audit_names *n, from_kgid(&init_user_ns, n->gid), MAJOR(n->rdev), MINOR(n->rdev)); - if (n->osid != 0) { + if (lsmprop_is_set(&n->oprop)) { char *ctx = NULL; u32 len; - if (security_secid_to_secctx( - n->osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", n->osid); + if (security_lsmprop_to_secctx(&n->oprop, &ctx, &len)) { if (call_panic) *call_panic = 2; } else { @@ -1653,8 +1652,8 @@ static void audit_log_uring(struct audit_context *ctx) audit_log_format(ab, "uring_op=%d", ctx->uring_op); if (ctx->return_valid != AUDITSC_INVALID) audit_log_format(ab, " success=%s exit=%ld", - (ctx->return_valid == AUDITSC_SUCCESS ? - "yes" : "no"), + str_yes_no(ctx->return_valid == + AUDITSC_SUCCESS), ctx->return_code); audit_log_format(ab, " items=%d" @@ -1696,8 +1695,8 @@ static void audit_log_exit(void) audit_log_format(ab, " per=%lx", context->personality); if (context->return_valid != AUDITSC_INVALID) audit_log_format(ab, " success=%s exit=%ld", - (context->return_valid == AUDITSC_SUCCESS ? 
- "yes" : "no"), + str_yes_no(context->return_valid == + AUDITSC_SUCCESS), context->return_code); audit_log_format(ab, " a0=%lx a1=%lx a2=%lx a3=%lx items=%d", @@ -1780,7 +1779,7 @@ static void audit_log_exit(void) axs->target_auid[i], axs->target_uid[i], axs->target_sessionid[i], - axs->target_sid[i], + &axs->target_ref[i], axs->target_comm[i])) call_panic = 1; } @@ -1789,7 +1788,7 @@ static void audit_log_exit(void) audit_log_pid_context(context, context->target_pid, context->target_auid, context->target_uid, context->target_sessionid, - context->target_sid, context->target_comm)) + &context->target_ref, context->target_comm)) call_panic = 1; if (context->pwd.dentry && context->pwd.mnt) { @@ -2278,7 +2277,7 @@ static void audit_copy_inode(struct audit_names *name, name->uid = inode->i_uid; name->gid = inode->i_gid; name->rdev = inode->i_rdev; - security_inode_getsecid(inode, &name->osid); + security_inode_getlsmprop(inode, &name->oprop); if (flags & AUDIT_INODE_NOEVAL) { name->fcap_ver = -1; return; @@ -2632,7 +2631,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp) context->ipc.gid = ipcp->gid; context->ipc.mode = ipcp->mode; context->ipc.has_perm = 0; - security_ipc_getsecid(ipcp, &context->ipc.osid); + security_ipc_getlsmprop(ipcp, &context->ipc.oprop); context->type = AUDIT_IPC; } @@ -2729,7 +2728,7 @@ void __audit_ptrace(struct task_struct *t) context->target_auid = audit_get_loginuid(t); context->target_uid = task_uid(t); context->target_sessionid = audit_get_sessionid(t); - security_task_getsecid_obj(t, &context->target_sid); + security_task_getlsmprop_obj(t, &context->target_ref); memcpy(context->target_comm, t->comm, TASK_COMM_LEN); } @@ -2756,7 +2755,7 @@ int audit_signal_info_syscall(struct task_struct *t) ctx->target_auid = audit_get_loginuid(t); ctx->target_uid = t_uid; ctx->target_sessionid = audit_get_sessionid(t); - security_task_getsecid_obj(t, &ctx->target_sid); + security_task_getlsmprop_obj(t, &ctx->target_ref); memcpy(ctx->target_comm, 
t->comm, TASK_COMM_LEN); return 0; } @@ -2777,7 +2776,7 @@ int audit_signal_info_syscall(struct task_struct *t) axp->target_auid[axp->pid_count] = audit_get_loginuid(t); axp->target_uid[axp->pid_count] = t_uid; axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); - security_task_getsecid_obj(t, &axp->target_sid[axp->pid_count]); + security_task_getlsmprop_obj(t, &axp->target_ref[axp->pid_count]); memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN); axp->pid_count++; diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 29da6d3838f6..e16e79f8cd6d 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -16,7 +16,6 @@ #include <uapi/linux/btf.h> #include <linux/bpf_lsm.h> #include <linux/btf_ids.h> -#include <linux/fdtable.h> #include <linux/rcupdate_trace.h> DEFINE_BPF_STORAGE_CACHE(inode_cache); diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 6292ac5f9bd1..3bc61628ab25 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -339,10 +339,6 @@ BTF_ID(func, bpf_lsm_path_chmod) BTF_ID(func, bpf_lsm_path_chown) #endif /* CONFIG_SECURITY_PATH */ -#ifdef CONFIG_KEYS -BTF_ID(func, bpf_lsm_key_free) -#endif /* CONFIG_KEYS */ - BTF_ID(func, bpf_lsm_mmap_file) BTF_ID(func, bpf_lsm_netlink_send) BTF_ID(func, bpf_lsm_path_notify) diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index adf6dfe0ba68..1eb9852a9f8e 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -16,7 +16,6 @@ #include <linux/filter.h> #include <uapi/linux/btf.h> #include <linux/btf_ids.h> -#include <linux/fdtable.h> #include <linux/rcupdate_trace.h> DEFINE_BPF_STORAGE_CACHE(task_cache); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 75e4fe83c509..5cd1c7a23848 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3523,7 +3523,7 @@ end: * (i + 1) * elem_size * where i is the repeat index and elem_size is the size of an element. 
*/ -static int btf_repeat_fields(struct btf_field_info *info, +static int btf_repeat_fields(struct btf_field_info *info, int info_cnt, u32 field_cnt, u32 repeat_cnt, u32 elem_size) { u32 i, j; @@ -3543,6 +3543,12 @@ static int btf_repeat_fields(struct btf_field_info *info, } } + /* The type of struct size or variable size is u32, + * so the multiplication will not overflow. + */ + if (field_cnt * (repeat_cnt + 1) > info_cnt) + return -E2BIG; + cur = field_cnt; for (i = 0; i < repeat_cnt; i++) { memcpy(&info[cur], &info[0], field_cnt * sizeof(info[0])); @@ -3587,7 +3593,7 @@ static int btf_find_nested_struct(const struct btf *btf, const struct btf_type * info[i].off += off; if (nelems > 1) { - err = btf_repeat_fields(info, ret, nelems - 1, t->size); + err = btf_repeat_fields(info, info_cnt, ret, nelems - 1, t->size); if (err == 0) ret *= nelems; else @@ -3681,10 +3687,10 @@ static int btf_find_field_one(const struct btf *btf, if (ret == BTF_FIELD_IGNORE) return 0; - if (nelems > info_cnt) + if (!info_cnt) return -E2BIG; if (nelems > 1) { - ret = btf_repeat_fields(info, 1, nelems - 1, sz); + ret = btf_repeat_fields(info, info_cnt, 1, nelems - 1, sz); if (ret < 0) return ret; } @@ -8961,6 +8967,7 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, if (!type) { bpf_log(ctx->log, "relo #%u: bad type id %u\n", relo_idx, relo->type_id); + kfree(specs); return -EINVAL; } diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index e7113d700b87..025d7e2214ae 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -24,6 +24,23 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE); EXPORT_SYMBOL(cgroup_bpf_enabled_key); +/* + * cgroup bpf destruction makes heavy use of work items and there can be a lot + * of concurrent destructions. Use a separate workqueue so that cgroup bpf + * destruction work items don't end up filling up max_active of system_wq + * which may lead to deadlock. 
+ */ +static struct workqueue_struct *cgroup_bpf_destroy_wq; + +static int __init cgroup_bpf_wq_init(void) +{ + cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 0, 1); + if (!cgroup_bpf_destroy_wq) + panic("Failed to alloc workqueue for cgroup bpf destroy.\n"); + return 0; +} +core_initcall(cgroup_bpf_wq_init); + /* __always_inline is necessary to prevent indirect call through run_prog * function pointer. */ @@ -334,7 +351,7 @@ static void cgroup_bpf_release_fn(struct percpu_ref *ref) struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt); INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release); - queue_work(system_wq, &cgrp->bpf.release_work); + queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work); } /* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 4e07cc057d6f..e303626bdb2f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -21,7 +21,7 @@ #include <linux/filter.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> -#include <linux/random.h> +#include <linux/prandom.h> #include <linux/bpf.h> #include <linux/btf.h> #include <linux/objtool.h> @@ -40,7 +40,7 @@ #include <linux/execmem.h> #include <asm/barrier.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> /* Registers */ #define BPF_R0 regs[BPF_REG_0] diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 9e0e3b0a18e4..7878be18e9d2 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -333,9 +333,11 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, struct xdp_frame **frames, int n, - struct net_device *dev) + struct net_device *tx_dev, + struct net_device *rx_dev) { - struct xdp_txq_info txq = { .dev = dev }; + struct xdp_txq_info txq = { .dev = tx_dev }; + struct xdp_rxq_info rxq = { .dev = rx_dev }; struct xdp_buff xdp; int i, nframes = 0; @@ -346,6 +348,7 @@ static int 
dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, xdp_convert_frame_to_buff(xdpf, &xdp); xdp.txq = &txq; + xdp.rxq = &rxq; act = bpf_prog_run_xdp(xdp_prog, &xdp); switch (act) { @@ -360,7 +363,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, bpf_warn_invalid_xdp_action(NULL, xdp_prog, act); fallthrough; case XDP_ABORTED: - trace_xdp_exception(dev, xdp_prog, act); + trace_xdp_exception(tx_dev, xdp_prog, act); fallthrough; case XDP_DROP: xdp_return_frame_rx_napi(xdpf); @@ -388,7 +391,7 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) } if (bq->xdp_prog) { - to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev); + to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx); if (!to_send) goto out; } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1a43d06eab28..3d45ebe8afb4 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -111,7 +111,7 @@ const struct bpf_func_proto bpf_map_pop_elem_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT, + .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) @@ -124,7 +124,7 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT, + .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu) @@ -538,7 +538,7 @@ const struct bpf_func_proto bpf_strtol_proto = { .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg4_size = sizeof(s64), }; @@ -566,7 +566,7 @@ const struct bpf_func_proto 
bpf_strtoul_proto = { .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg4_size = sizeof(u64), }; @@ -1742,7 +1742,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT, + .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, @@ -2851,21 +2851,47 @@ struct bpf_iter_bits { __u64 __opaque[2]; } __aligned(8); +#define BITS_ITER_NR_WORDS_MAX 511 + struct bpf_iter_bits_kern { union { - unsigned long *bits; - unsigned long bits_copy; + __u64 *bits; + __u64 bits_copy; }; - u32 nr_bits; + int nr_bits; int bit; } __aligned(8); +/* On 64-bit hosts, unsigned long and u64 have the same size, so passing + * a u64 pointer and an unsigned long pointer to find_next_bit() will + * return the same result, as both point to the same 8-byte area. + * + * For 32-bit little-endian hosts, using a u64 pointer or unsigned long + * pointer also makes no difference. This is because the first iterated + * unsigned long is composed of bits 0-31 of the u64 and the second unsigned + * long is composed of bits 32-63 of the u64. + * + * However, for 32-bit big-endian hosts, this is not the case. The first + * iterated unsigned long will be bits 32-63 of the u64, so swap these two + * ulong values within the u64. 
+ */ +static void swap_ulong_in_u64(u64 *bits, unsigned int nr) +{ +#if (BITS_PER_LONG == 32) && defined(__BIG_ENDIAN) + unsigned int i; + + for (i = 0; i < nr; i++) + bits[i] = (bits[i] >> 32) | ((u64)(u32)bits[i] << 32); +#endif +} + /** * bpf_iter_bits_new() - Initialize a new bits iterator for a given memory area * @it: The new bpf_iter_bits to be created * @unsafe_ptr__ign: A pointer pointing to a memory area to be iterated over * @nr_words: The size of the specified memory area, measured in 8-byte units. - * Due to the limitation of memalloc, it can't be greater than 512. + * The maximum value of @nr_words is @BITS_ITER_NR_WORDS_MAX. This limit may be + * further reduced by the BPF memory allocator implementation. * * This function initializes a new bpf_iter_bits structure for iterating over * a memory area which is specified by the @unsafe_ptr__ign and @nr_words. It @@ -2892,6 +2918,8 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w if (!unsafe_ptr__ign || !nr_words) return -EINVAL; + if (nr_words > BITS_ITER_NR_WORDS_MAX) + return -E2BIG; /* Optimization for u64 mask */ if (nr_bits == 64) { @@ -2899,10 +2927,15 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w if (err) return -EFAULT; + swap_ulong_in_u64(&kit->bits_copy, nr_words); + kit->nr_bits = nr_bits; return 0; } + if (bpf_mem_alloc_check_size(false, nr_bytes)) + return -E2BIG; + /* Fallback to memalloc */ kit->bits = bpf_mem_alloc(&bpf_global_ma, nr_bytes); if (!kit->bits) @@ -2914,6 +2947,8 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w return err; } + swap_ulong_in_u64(kit->bits, nr_words); + kit->nr_bits = nr_bits; return 0; } @@ -2930,17 +2965,16 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w __bpf_kfunc int *bpf_iter_bits_next(struct bpf_iter_bits *it) { struct bpf_iter_bits_kern *kit = (void *)it; - u32 nr_bits = kit->nr_bits; - const unsigned long 
*bits; - int bit; + int bit = kit->bit, nr_bits = kit->nr_bits; + const void *bits; - if (nr_bits == 0) + if (!nr_bits || bit >= nr_bits) return NULL; bits = nr_bits == 64 ? &kit->bits_copy : kit->bits; - bit = find_next_bit(bits, nr_bits, kit->bit + 1); + bit = find_next_bit(bits, nr_bits, bit + 1); if (bit >= nr_bits) { - kit->nr_bits = 0; + kit->bit = bit; return NULL; } diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index d8fc5eba529d..9aaf5124648b 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -880,7 +880,7 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) const struct btf_type *enum_t; const char *enum_pfx; u64 *delegate_msk, msk = 0; - char *p; + char *p, *str; int val; /* ignore errors, fallback to hex */ @@ -911,7 +911,8 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) return -EINVAL; } - while ((p = strsep(&param->string, ":"))) { + str = param->string; + while ((p = strsep(&str, ":"))) { if (strcmp(p, "any") == 0) { msk |= ~0ULL; } else if (find_btf_enum_const(info.btf, enum_t, enum_pfx, p, &val)) { diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 5aebfc3051e3..4a858fdb6476 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -688,8 +688,7 @@ static void print_reg_state(struct bpf_verifier_env *env, if (t == SCALAR_VALUE && reg->precise) verbose(env, "P"); if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) { - /* reg->off should be 0 for SCALAR_VALUE */ - verbose_snum(env, reg->var_off.value + reg->off); + verbose_snum(env, reg->var_off.value); return; } diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 0218a5132ab5..9b60eda0f727 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -655,7 +655,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) if (!key || key->prefixlen > trie->max_prefixlen) goto find_leftmost; - node_stack = kmalloc_array(trie->max_prefixlen, + node_stack =
kmalloc_array(trie->max_prefixlen + 1, sizeof(struct lpm_trie_node *), GFP_ATOMIC | __GFP_NOWARN); if (!node_stack) diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index b3858a76e0b3..146f5b57cfb1 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -35,6 +35,8 @@ */ #define LLIST_NODE_SZ sizeof(struct llist_node) +#define BPF_MEM_ALLOC_SIZE_MAX 4096 + /* similar to kmalloc, but sizeof == 8 bucket is gone */ static u8 size_index[24] __ro_after_init = { 3, /* 8 */ @@ -65,7 +67,7 @@ static u8 size_index[24] __ro_after_init = { static int bpf_mem_cache_idx(size_t size) { - if (!size || size > 4096) + if (!size || size > BPF_MEM_ALLOC_SIZE_MAX) return -1; if (size <= 192) @@ -1005,3 +1007,13 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags) return !ret ? NULL : ret + LLIST_NODE_SZ; } + +int bpf_mem_alloc_check_size(bool percpu, size_t size) +{ + /* The size of percpu allocation doesn't have LLIST_NODE_SZ overhead */ + if ((percpu && size > BPF_MEM_ALLOC_SIZE_MAX) || + (!percpu && size > BPF_MEM_ALLOC_SIZE_MAX - LLIST_NODE_SZ)) + return -E2BIG; + + return 0; +} diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index e20b90c36131..e1cfe890e0be 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -29,7 +29,7 @@ struct bpf_ringbuf { u64 mask; struct page **pages; int nr_pages; - spinlock_t spinlock ____cacheline_aligned_in_smp; + raw_spinlock_t spinlock ____cacheline_aligned_in_smp; /* For user-space producer ring buffers, an atomic_t busy bit is used * to synchronize access to the ring buffers in the kernel, rather than * the spinlock that is used for kernel-producer ring buffers. 
This is @@ -173,7 +173,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) if (!rb) return NULL; - spin_lock_init(&rb->spinlock); + raw_spin_lock_init(&rb->spinlock); atomic_set(&rb->busy, 0); init_waitqueue_head(&rb->waitq); init_irq_work(&rb->work, bpf_ringbuf_notify); @@ -421,10 +421,10 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) cons_pos = smp_load_acquire(&rb->consumer_pos); if (in_nmi()) { - if (!spin_trylock_irqsave(&rb->spinlock, flags)) + if (!raw_spin_trylock_irqsave(&rb->spinlock, flags)) return NULL; } else { - spin_lock_irqsave(&rb->spinlock, flags); + raw_spin_lock_irqsave(&rb->spinlock, flags); } pend_pos = rb->pending_pos; @@ -450,7 +450,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) */ if (new_prod_pos - cons_pos > rb->mask || new_prod_pos - pend_pos > rb->mask) { - spin_unlock_irqrestore(&rb->spinlock, flags); + raw_spin_unlock_irqrestore(&rb->spinlock, flags); return NULL; } @@ -462,7 +462,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) /* pairs with consumer's smp_load_acquire() */ smp_store_release(&rb->producer_pos, new_prod_pos); - spin_unlock_irqrestore(&rb->spinlock, flags); + raw_spin_unlock_irqrestore(&rb->spinlock, flags); return (void *)hdr + BPF_RINGBUF_HDR_SZ; } @@ -632,7 +632,7 @@ const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = { .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT, + .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a8f1808a1ca5..c5aa127ed4cc 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3069,13 +3069,17 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_link 
*link = filp->private_data; const struct bpf_prog *prog = link->prog; + enum bpf_link_type type = link->type; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; - seq_printf(m, - "link_type:\t%s\n" - "link_id:\t%u\n", - bpf_link_type_strs[link->type], - link->id); + if (type < ARRAY_SIZE(bpf_link_type_strs) && bpf_link_type_strs[type]) { + seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]); + } else { + WARN_ONCE(1, "missing BPF_LINK_TYPE(...) for link type %u\n", type); + seq_printf(m, "link_type:\t<%u>\n", type); + } + seq_printf(m, "link_id:\t%u\n", link->id); + if (prog) { bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, @@ -3565,15 +3569,16 @@ static void bpf_perf_link_dealloc(struct bpf_link *link) } static int bpf_perf_link_fill_common(const struct perf_event *event, - char __user *uname, u32 ulen, + char __user *uname, u32 *ulenp, u64 *probe_offset, u64 *probe_addr, u32 *fd_type, unsigned long *missed) { const char *buf; - u32 prog_id; + u32 prog_id, ulen; size_t len; int err; + ulen = *ulenp; if (!ulen ^ !uname) return -EINVAL; @@ -3581,10 +3586,17 @@ static int bpf_perf_link_fill_common(const struct perf_event *event, probe_offset, probe_addr, missed); if (err) return err; + + if (buf) { + len = strlen(buf); + *ulenp = len + 1; + } else { + *ulenp = 1; + } if (!uname) return 0; + if (buf) { - len = strlen(buf); err = bpf_copy_to_user(uname, buf, ulen, len); if (err) return err; @@ -3609,7 +3621,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event, uname = u64_to_user_ptr(info->perf_event.kprobe.func_name); ulen = info->perf_event.kprobe.name_len; - err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr, + err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr, &type, &missed); if (err) return err; @@ -3617,7 +3629,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event, info->perf_event.type = BPF_PERF_EVENT_KRETPROBE; else info->perf_event.type = BPF_PERF_EVENT_KPROBE; - + 
info->perf_event.kprobe.name_len = ulen; info->perf_event.kprobe.offset = offset; info->perf_event.kprobe.missed = missed; if (!kallsyms_show_value(current_cred())) @@ -3639,7 +3651,7 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event, uname = u64_to_user_ptr(info->perf_event.uprobe.file_name); ulen = info->perf_event.uprobe.name_len; - err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr, + err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr, &type, NULL); if (err) return err; @@ -3648,6 +3660,7 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event, info->perf_event.type = BPF_PERF_EVENT_URETPROBE; else info->perf_event.type = BPF_PERF_EVENT_UPROBE; + info->perf_event.uprobe.name_len = ulen; info->perf_event.uprobe.offset = offset; info->perf_event.uprobe.cookie = event->bpf_cookie; return 0; @@ -3673,12 +3686,18 @@ static int bpf_perf_link_fill_tracepoint(const struct perf_event *event, { char __user *uname; u32 ulen; + int err; uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name); ulen = info->perf_event.tracepoint.name_len; + err = bpf_perf_link_fill_common(event, uname, &ulen, NULL, NULL, NULL, NULL); + if (err) + return err; + info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT; + info->perf_event.tracepoint.name_len = ulen; info->perf_event.tracepoint.cookie = event->bpf_cookie; - return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL, NULL); + return 0; } static int bpf_perf_link_fill_perf_event(const struct perf_event *event, @@ -5877,7 +5896,7 @@ static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { .arg1_type = ARG_PTR_TO_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg4_size = sizeof(u64), }; diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 
02aa9db8d796..98d9b4c0daff 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -5,7 +5,6 @@ #include <linux/namei.h> #include <linux/pid_namespace.h> #include <linux/fs.h> -#include <linux/fdtable.h> #include <linux/filter.h> #include <linux/bpf_mem_alloc.h> #include <linux/btf_ids.h> @@ -99,7 +98,7 @@ static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *co rcu_read_lock(); pid = find_pid_ns(common->pid, common->ns); if (pid) { - task = get_pid_task(pid, PIDTYPE_TGID); + task = get_pid_task(pid, PIDTYPE_PID); *tid = common->pid; } rcu_read_unlock(); @@ -286,17 +285,14 @@ again: curr_fd = 0; } - rcu_read_lock(); - f = task_lookup_next_fdget_rcu(curr_task, &curr_fd); + f = fget_task_next(curr_task, &curr_fd); if (f) { /* set info->fd */ info->fd = curr_fd; - rcu_read_unlock(); return f; } /* the current task is done, go to the next task */ - rcu_read_unlock(); put_task_struct(curr_task); if (info->common.type == BPF_TASK_ITER_TID) { diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index dcbec1a0dfb3..26057aa13503 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -1,6 +1,5 @@ #include <linux/bpf.h> #include <linux/vmalloc.h> -#include <linux/fdtable.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/kernel.h> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9a7ed527e47e..bb99bada7e2e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2750,10 +2750,16 @@ static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env, b->module = mod; b->offset = offset; + /* sort() reorders entries by value, so b may no longer point + * to the right entry after this + */ sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), kfunc_btf_cmp_by_off, NULL); + } else { + btf = b->btf; } - return b->btf; + + return btf; } void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab) @@ -6333,10 +6339,10 @@ static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size) /* both of 
s64_max/s64_min positive or negative */ if ((s64_max >= 0) == (s64_min >= 0)) { - reg->smin_value = reg->s32_min_value = s64_min; - reg->smax_value = reg->s32_max_value = s64_max; - reg->umin_value = reg->u32_min_value = s64_min; - reg->umax_value = reg->u32_max_value = s64_max; + reg->s32_min_value = reg->smin_value = s64_min; + reg->s32_max_value = reg->smax_value = s64_max; + reg->u32_min_value = reg->umin_value = s64_min; + reg->u32_max_value = reg->umax_value = s64_max; reg->var_off = tnum_range(s64_min, s64_max); return; } @@ -6798,20 +6804,10 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env, struct bpf_func_state *state, enum bpf_access_type t) { - struct bpf_insn_aux_data *aux = &env->insn_aux_data[env->insn_idx]; - int min_valid_off, max_bpf_stack; - - /* If accessing instruction is a spill/fill from bpf_fastcall pattern, - * add room for all caller saved registers below MAX_BPF_STACK. - * In case if bpf_fastcall rewrite won't happen maximal stack depth - * would be checked by check_max_stack_depth_subprog(). 
- */ - max_bpf_stack = MAX_BPF_STACK; - if (aux->fastcall_pattern) - max_bpf_stack += CALLER_SAVED_REGS * BPF_REG_SIZE; + int min_valid_off; if (t == BPF_WRITE || env->allow_uninit_stack) - min_valid_off = -max_bpf_stack; + min_valid_off = -MAX_BPF_STACK; else min_valid_off = -state->allocated_stack; @@ -7432,7 +7428,8 @@ mark: } static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, - int access_size, bool zero_size_allowed, + int access_size, enum bpf_access_type access_type, + bool zero_size_allowed, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; @@ -7444,7 +7441,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); case PTR_TO_MAP_KEY: - if (meta && meta->raw_mode) { + if (access_type == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", regno, reg_type_str(env, reg->type)); return -EACCES; @@ -7452,15 +7449,13 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_mem_region_access(env, regno, reg->off, access_size, reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: - if (check_map_access_type(env, regno, reg->off, access_size, - meta && meta->raw_mode ? 
BPF_WRITE : - BPF_READ)) + if (check_map_access_type(env, regno, reg->off, access_size, access_type)) return -EACCES; return check_map_access(env, regno, reg->off, access_size, zero_size_allowed, ACCESS_HELPER); case PTR_TO_MEM: if (type_is_rdonly_mem(reg->type)) { - if (meta && meta->raw_mode) { + if (access_type == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", regno, reg_type_str(env, reg->type)); return -EACCES; @@ -7471,7 +7466,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, zero_size_allowed); case PTR_TO_BUF: if (type_is_rdonly_mem(reg->type)) { - if (meta && meta->raw_mode) { + if (access_type == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", regno, reg_type_str(env, reg->type)); return -EACCES; @@ -7499,7 +7494,6 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, * Dynamically check it now. */ if (!env->ops->convert_ctx_access) { - enum bpf_access_type atype = meta && meta->raw_mode ? BPF_WRITE : BPF_READ; int offset = access_size - 1; /* Allow zero-byte read from PTR_TO_CTX */ @@ -7507,7 +7501,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return zero_size_allowed ? 0 : -EACCES; return check_mem_access(env, env->insn_idx, regno, offset, BPF_B, - atype, -1, false, false); + access_type, -1, false, false); } fallthrough; @@ -7532,6 +7526,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, */ static int check_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, + enum bpf_access_type access_type, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { @@ -7547,15 +7542,12 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, */ meta->msize_max_value = reg->umax_value; - /* The register is SCALAR_VALUE; the access check - * happens using its boundaries. + /* The register is SCALAR_VALUE; the access check happens using + * its boundaries. 
For unprivileged variable accesses, disable + * raw mode so that the program is required to initialize all + * the memory that the helper could just partially fill up. */ if (!tnum_is_const(reg->var_off)) - /* For unprivileged variable accesses, disable raw - * mode so that the program is required to - * initialize all the memory that the helper could - * just partially fill up. - */ meta = NULL; if (reg->smin_value < 0) { @@ -7575,9 +7567,8 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, regno); return -EACCES; } - err = check_helper_mem_access(env, regno - 1, - reg->umax_value, - zero_size_allowed, meta); + err = check_helper_mem_access(env, regno - 1, reg->umax_value, + access_type, zero_size_allowed, meta); if (!err) err = mark_chain_precision(env, regno); return err; @@ -7588,13 +7579,11 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg { bool may_be_null = type_may_be_null(reg->type); struct bpf_reg_state saved_reg; - struct bpf_call_arg_meta meta; int err; if (register_is_null(reg)) return 0; - memset(&meta, 0, sizeof(meta)); /* Assuming that the register contains a value check if the memory * access is safe. Temporarily save and restore the register's state as * the conversion shouldn't be visible to a caller. 
@@ -7604,10 +7593,8 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg mark_ptr_not_null_reg(reg); } - err = check_helper_mem_access(env, regno, mem_size, true, &meta); - /* Check access for BPF_WRITE */ - meta.raw_mode = true; - err = err ?: check_helper_mem_access(env, regno, mem_size, true, &meta); + err = check_helper_mem_access(env, regno, mem_size, BPF_READ, true, NULL); + err = err ?: check_helper_mem_access(env, regno, mem_size, BPF_WRITE, true, NULL); if (may_be_null) *reg = saved_reg; @@ -7633,13 +7620,12 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg mark_ptr_not_null_reg(mem_reg); } - err = check_mem_size_reg(env, reg, regno, true, &meta); - /* Check access for BPF_WRITE */ - meta.raw_mode = true; - err = err ?: check_mem_size_reg(env, reg, regno, true, &meta); + err = check_mem_size_reg(env, reg, regno, BPF_READ, true, &meta); + err = err ?: check_mem_size_reg(env, reg, regno, BPF_WRITE, true, &meta); if (may_be_null) *mem_reg = saved_reg; + return err; } @@ -8942,9 +8928,8 @@ skip_type_check: verbose(env, "invalid map_ptr to access map->key\n"); return -EACCES; } - err = check_helper_mem_access(env, regno, - meta->map_ptr->key_size, false, - NULL); + err = check_helper_mem_access(env, regno, meta->map_ptr->key_size, + BPF_READ, false, NULL); break; case ARG_PTR_TO_MAP_VALUE: if (type_may_be_null(arg_type) && register_is_null(reg)) @@ -8959,9 +8944,9 @@ skip_type_check: return -EACCES; } meta->raw_mode = arg_type & MEM_UNINIT; - err = check_helper_mem_access(env, regno, - meta->map_ptr->value_size, false, - meta); + err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, + arg_type & MEM_WRITE ? 
BPF_WRITE : BPF_READ, + false, meta); break; case ARG_PTR_TO_PERCPU_BTF_ID: if (!reg->btf_id) { @@ -9003,7 +8988,9 @@ skip_type_check: */ meta->raw_mode = arg_type & MEM_UNINIT; if (arg_type & MEM_FIXED_SIZE) { - err = check_helper_mem_access(env, regno, fn->arg_size[arg], false, meta); + err = check_helper_mem_access(env, regno, fn->arg_size[arg], + arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ, + false, meta); if (err) return err; if (arg_type & MEM_ALIGNED) @@ -9011,10 +8998,16 @@ skip_type_check: } break; case ARG_CONST_SIZE: - err = check_mem_size_reg(env, reg, regno, false, meta); + err = check_mem_size_reg(env, reg, regno, + fn->arg_type[arg - 1] & MEM_WRITE ? + BPF_WRITE : BPF_READ, + false, meta); break; case ARG_CONST_SIZE_OR_ZERO: - err = check_mem_size_reg(env, reg, regno, true, meta); + err = check_mem_size_reg(env, reg, regno, + fn->arg_type[arg - 1] & MEM_WRITE ? + BPF_WRITE : BPF_READ, + true, meta); break; case ARG_PTR_TO_DYNPTR: err = process_dynptr_func(env, regno, insn_idx, arg_type, 0); @@ -14264,12 +14257,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * r1 += 0x1 * if r2 < 1000 goto ... * use r1 in memory access - * So remember constant delta between r2 and r1 and update r1 after - * 'if' condition. + * So for 64-bit alu remember constant delta between r2 and r1 and + * update r1 after 'if' condition. 
*/ - if (env->bpf_capable && BPF_OP(insn->code) == BPF_ADD && - dst_reg->id && is_reg_const(src_reg, alu32)) { - u64 val = reg_const_value(src_reg, alu32); + if (env->bpf_capable && + BPF_OP(insn->code) == BPF_ADD && !alu32 && + dst_reg->id && is_reg_const(src_reg, false)) { + u64 val = reg_const_value(src_reg, false); if ((dst_reg->id & BPF_ADD_CONST) || /* prevent overflow in sync_linked_regs() later */ @@ -15326,8 +15320,12 @@ static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_s continue; if ((!(reg->id & BPF_ADD_CONST) && !(known_reg->id & BPF_ADD_CONST)) || reg->off == known_reg->off) { + s32 saved_subreg_def = reg->subreg_def; + copy_register_state(reg, known_reg); + reg->subreg_def = saved_subreg_def; } else { + s32 saved_subreg_def = reg->subreg_def; s32 saved_off = reg->off; fake_reg.type = SCALAR_VALUE; @@ -15340,6 +15338,7 @@ static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_s * otherwise another sync_linked_regs() will be incorrect. 
*/ reg->off = saved_off; + reg->subreg_def = saved_subreg_def; scalar32_min_max_add(reg, &fake_reg); scalar_min_max_add(reg, &fake_reg); @@ -17877,9 +17876,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *sl, **pprev; struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry; int i, j, n, err, states_cnt = 0; - bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx); - bool add_new_state = force_new_state; - bool force_exact; + bool force_new_state, add_new_state, force_exact; + + force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) || + /* Avoid accumulating infinitely long jmp history */ + cur->jmp_history_cnt > 40; /* bpf progs typically have pruning point every 4 instructions * http://vger.kernel.org/bpfconf2019.html#session-1 @@ -17889,6 +17890,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * In tests that amounts to up to 50% reduction into total verifier * memory consumption and 20% verifier time speedup. 
*/ + add_new_state = force_new_state; if (env->jmps_processed - env->prev_jmps_processed >= 2 && env->insn_processed - env->prev_insn_processed >= 8) add_new_state = true; @@ -21201,7 +21203,7 @@ patch_map_ops_generic: delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; - continue; + goto next_insn; } /* Implement bpf_kptr_xchg inline */ @@ -22310,7 +22312,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 /* 'struct bpf_verifier_env' can be global, but since it's not small, * allocate/free it every time bpf_check() is called */ - env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); + env = kvzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; @@ -22546,6 +22548,6 @@ err_unlock: mutex_unlock(&bpf_verifier_lock); vfree(env->insn_aux_data); err_free_env: - kfree(env); + kvfree(env); return ret; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 5886b95c6eae..9bc4a84bd309 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5789,7 +5789,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent) { struct cgroup *cgroup; int ret = false; - int level = 1; + int level = 0; lockdep_assert_held(&cgroup_mutex); @@ -5797,7 +5797,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent) if (cgroup->nr_descendants >= cgroup->max_descendants) goto fail; - if (level > cgroup->max_depth) + if (level >= cgroup->max_depth) goto fail; level++; @@ -6476,7 +6476,6 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) struct cgroup *dst_cgrp = NULL; struct css_set *cset; struct super_block *sb; - struct file *f; if (kargs->flags & CLONE_INTO_CGROUP) cgroup_lock(); @@ -6493,14 +6492,14 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) return 0; } - f = fget_raw(kargs->cgroup); - if (!f) { + CLASS(fd_raw, f)(kargs->cgroup); + if (fd_empty(f)) { ret = -EBADF; goto err; } - sb = f->f_path.dentry->d_sb; + 
sb = fd_file(f)->f_path.dentry->d_sb; - dst_cgrp = cgroup_get_from_file(f); + dst_cgrp = cgroup_get_from_file(fd_file(f)); if (IS_ERR(dst_cgrp)) { ret = PTR_ERR(dst_cgrp); dst_cgrp = NULL; @@ -6548,15 +6547,12 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) } put_css_set(cset); - fput(f); kargs->cgrp = dst_cgrp; return ret; err: cgroup_threadgroup_change_end(current); cgroup_unlock(); - if (f) - fput(f); if (dst_cgrp) cgroup_put(dst_cgrp); put_css_set(cset); @@ -6966,14 +6962,11 @@ EXPORT_SYMBOL_GPL(cgroup_get_from_path); */ struct cgroup *cgroup_v1v2_get_from_fd(int fd) { - struct cgroup *cgrp; - struct fd f = fdget_raw(fd); - if (!fd_file(f)) + CLASS(fd_raw, f)(fd); + if (fd_empty(f)) return ERR_PTR(-EBADF); - cgrp = cgroup_v1v2_get_from_file(fd_file(f)); - fdput(f); - return cgrp; + return cgroup_v1v2_get_from_file(fd_file(f)); } /** diff --git a/kernel/cpu.c b/kernel/cpu.c index 895f3287e3f3..6e34b52cb5ce 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1338,6 +1338,8 @@ static int takedown_cpu(unsigned int cpu) cpuhp_bp_sync_dead(cpu); + lockdep_cleanup_dead_cpu(cpu, idle_thread_get(cpu)); + /* * Callbacks must be re-integrated right away to the RCU state machine. 
* Otherwise an RCU callback could block a further teardown function diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 9d34d2364b5a..f625172d4b67 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -33,7 +33,7 @@ #include <linux/reboot.h> #include <linux/uaccess.h> #include <asm/cacheflush.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include "debug_core.h" #define KGDB_MAX_THREAD_QUERY 17 diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 5b6934e23c21..e33691d5adf7 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -98,7 +98,7 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, local_irq_enable_exit_to_user(ti_work); - if (ti_work & _TIF_NEED_RESCHED) + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) schedule(); if (ti_work & _TIF_UPROBE) diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c index 2e0f75bcb7fd..8485f63863af 100644 --- a/kernel/entry/kvm.c +++ b/kernel/entry/kvm.c @@ -13,7 +13,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) return -EINTR; } - if (ti_work & _TIF_NEED_RESCHED) + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) schedule(); if (ti_work & _TIF_NOTIFY_RESUME) @@ -24,7 +24,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) return ret; ti_work = read_thread_flags(); - } while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched()); + } while (ti_work & XFER_TO_GUEST_MODE_WORK); return 0; } diff --git a/kernel/events/core.c b/kernel/events/core.c index e3589c4287cb..5d4a54f50826 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -966,22 +966,20 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, { struct perf_cgroup *cgrp; struct cgroup_subsys_state *css; - struct fd f = fdget(fd); + CLASS(fd, f)(fd); int ret = 0; - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; css = 
css_tryget_online_from_dir(fd_file(f)->f_path.dentry, &perf_event_cgrp_subsys); - if (IS_ERR(css)) { - ret = PTR_ERR(css); - goto out; - } + if (IS_ERR(css)) + return PTR_ERR(css); ret = perf_cgroup_ensure_storage(event, css); if (ret) - goto out; + return ret; cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; @@ -995,8 +993,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, perf_detach_cgroup(event); ret = -EINVAL; } -out: - fdput(f); return ret; } @@ -2146,7 +2142,7 @@ static void perf_put_aux_event(struct perf_event *event) static bool perf_need_aux_event(struct perf_event *event) { - return !!event->attr.aux_output || !!event->attr.aux_sample_size; + return event->attr.aux_output || has_aux_action(event); } static int perf_get_aux_event(struct perf_event *event, @@ -2171,6 +2167,10 @@ static int perf_get_aux_event(struct perf_event *event, !perf_aux_output_match(event, group_leader)) return 0; + if ((event->attr.aux_pause || event->attr.aux_resume) && + !(group_leader->pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) + return 0; + if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux) return 0; @@ -5998,18 +5998,9 @@ EXPORT_SYMBOL_GPL(perf_event_period); static const struct file_operations perf_fops; -static inline int perf_fget_light(int fd, struct fd *p) +static inline bool is_perf_file(struct fd f) { - struct fd f = fdget(fd); - if (!fd_file(f)) - return -EBADF; - - if (fd_file(f)->f_op != &perf_fops) { - fdput(f); - return -EBADF; - } - *p = f; - return 0; + return !fd_empty(f) && fd_file(f)->f_op == &perf_fops; } static int perf_event_set_output(struct perf_event *event, @@ -6057,20 +6048,14 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon case PERF_EVENT_IOC_SET_OUTPUT: { - int ret; + CLASS(fd, output)(arg); // arg == -1 => empty + struct perf_event *output_event = NULL; if (arg != -1) { - struct perf_event *output_event; - struct fd output; - ret = 
perf_fget_light(arg, &output); - if (ret) - return ret; + if (!is_perf_file(output)) + return -EBADF; output_event = fd_file(output)->private_data; - ret = perf_event_set_output(event, output_event); - fdput(output); - } else { - ret = perf_event_set_output(event, NULL); } - return ret; + return perf_event_set_output(event, output_event); } case PERF_EVENT_IOC_SET_FILTER: @@ -7022,6 +7007,29 @@ void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); #endif +static bool should_sample_guest(struct perf_event *event) +{ + return !event->attr.exclude_guest && perf_guest_state(); +} + +unsigned long perf_misc_flags(struct perf_event *event, + struct pt_regs *regs) +{ + if (should_sample_guest(event)) + return perf_arch_guest_misc_flags(regs); + + return perf_arch_misc_flags(regs); +} + +unsigned long perf_instruction_pointer(struct perf_event *event, + struct pt_regs *regs) +{ + if (should_sample_guest(event)) + return perf_guest_get_ip(); + + return perf_arch_instruction_pointer(regs); +} + static void perf_output_sample_regs(struct perf_output_handle *handle, struct pt_regs *regs, u64 mask) @@ -7839,7 +7847,7 @@ void perf_prepare_sample(struct perf_sample_data *data, __perf_event_header__init_id(data, event, filtered_sample_type); if (filtered_sample_type & PERF_SAMPLE_IP) { - data->ip = perf_instruction_pointer(regs); + data->ip = perf_instruction_pointer(event, regs); data->sample_flags |= PERF_SAMPLE_IP; } @@ -8003,7 +8011,7 @@ void perf_prepare_header(struct perf_event_header *header, { header->type = PERF_RECORD_SAMPLE; header->size = perf_sample_data_size(data, event); - header->misc = perf_misc_flags(regs); + header->misc = perf_misc_flags(event, regs); /* * If you're adding more sample types here, you likely need to do @@ -8016,6 +8024,49 @@ void perf_prepare_header(struct perf_event_header *header, WARN_ON_ONCE(header->size & 7); } +static void __perf_event_aux_pause(struct 
perf_event *event, bool pause) +{ + if (pause) { + if (!event->hw.aux_paused) { + event->hw.aux_paused = 1; + event->pmu->stop(event, PERF_EF_PAUSE); + } + } else { + if (event->hw.aux_paused) { + event->hw.aux_paused = 0; + event->pmu->start(event, PERF_EF_RESUME); + } + } +} + +static void perf_event_aux_pause(struct perf_event *event, bool pause) +{ + struct perf_buffer *rb; + + if (WARN_ON_ONCE(!event)) + return; + + rb = ring_buffer_get(event); + if (!rb) + return; + + scoped_guard (irqsave) { + /* + * Guard against self-recursion here. Another event could trip + * this same from NMI context. + */ + if (READ_ONCE(rb->aux_in_pause_resume)) + break; + + WRITE_ONCE(rb->aux_in_pause_resume, 1); + barrier(); + __perf_event_aux_pause(event, pause); + barrier(); + WRITE_ONCE(rb->aux_in_pause_resume, 0); + } + ring_buffer_put(rb); +} + static __always_inline int __perf_event_output(struct perf_event *event, struct perf_sample_data *data, @@ -9251,7 +9302,7 @@ static void perf_event_switch(struct task_struct *task, }, }; - if (!sched_in && task->on_rq) { + if (!sched_in && task_is_runnable(task)) { switch_event.event_id.header.misc |= PERF_RECORD_MISC_SWITCH_OUT_PREEMPT; } @@ -9818,9 +9869,12 @@ static int __perf_event_overflow(struct perf_event *event, ret = __perf_event_account_interrupt(event, throttle); + if (event->attr.aux_pause) + perf_event_aux_pause(event->aux_event, true); + if (event->prog && event->prog->type == BPF_PROG_TYPE_PERF_EVENT && !bpf_overflow_handler(event, data, regs)) - return ret; + goto out; /* * XXX event_limit might not quite work as expected on inherited @@ -9882,6 +9936,9 @@ static int __perf_event_overflow(struct perf_event *event, event->pending_wakeup = 1; irq_work_queue(&event->pending_irq); } +out: + if (event->attr.aux_resume) + perf_event_aux_pause(event->aux_event, false); return ret; } @@ -12273,11 +12330,25 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } if (event->attr.aux_output && - !(pmu->capabilities & 
PERF_PMU_CAP_AUX_OUTPUT)) { + (!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) || + event->attr.aux_pause || event->attr.aux_resume)) { err = -EOPNOTSUPP; goto err_pmu; } + if (event->attr.aux_pause && event->attr.aux_resume) { + err = -EINVAL; + goto err_pmu; + } + + if (event->attr.aux_start_paused) { + if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) { + err = -EOPNOTSUPP; + goto err_pmu; + } + event->hw.aux_paused = 1; + } + if (cgroup_fd != -1) { err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); if (err) @@ -12664,7 +12735,6 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_attr attr; struct perf_event_context *ctx; struct file *event_file = NULL; - struct fd group = EMPTY_FD; struct task_struct *task = NULL; struct pmu *pmu; int event_fd; @@ -12735,10 +12805,12 @@ SYSCALL_DEFINE5(perf_event_open, if (event_fd < 0) return event_fd; + CLASS(fd, group)(group_fd); // group_fd == -1 => empty if (group_fd != -1) { - err = perf_fget_light(group_fd, &group); - if (err) + if (!is_perf_file(group)) { + err = -EBADF; goto err_fd; + } group_leader = fd_file(group)->private_data; if (flags & PERF_FLAG_FD_OUTPUT) output_event = group_leader; @@ -12750,7 +12822,7 @@ SYSCALL_DEFINE5(perf_event_open, task = find_lively_task_by_vpid(pid); if (IS_ERR(task)) { err = PTR_ERR(task); - goto err_group_fd; + goto err_fd; } } @@ -13017,12 +13089,11 @@ SYSCALL_DEFINE5(perf_event_open, mutex_unlock(&current->perf_event_mutex); /* - * Drop the reference on the group_event after placing the - * new event on the sibling_list. This ensures destruction - * of the group leader will find the pointer to itself in - * perf_group_detach(). + * File reference in group guarantees that group_leader has been + * kept alive until we place the new event on the sibling_list. + * This ensures destruction of the group leader will find + * the pointer to itself in perf_group_detach(). 
*/ - fdput(group); fd_install(event_fd, event_file); return event_fd; @@ -13041,8 +13112,6 @@ err_alloc: err_task: if (task) put_task_struct(task); -err_group_fd: - fdput(group); err_fd: put_unused_fd(event_fd); return err; @@ -13073,7 +13142,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, * Grouping is not supported for kernel events, neither is 'AUX', * make sure the caller's intentions are adjusted. */ - if (attr->aux_output) + if (attr->aux_output || attr->aux_action) return ERR_PTR(-EINVAL); event = perf_event_alloc(attr, cpu, task, NULL, NULL, @@ -13959,7 +14028,7 @@ static void perf_event_clear_cpumask(unsigned int cpu) } /* migrate */ - list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { + list_for_each_entry(pmu, &pmus, entry) { if (pmu->scope == PERF_PMU_SCOPE_NONE || WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE)) continue; diff --git a/kernel/events/internal.h b/kernel/events/internal.h index e072d995d670..249288d82b8d 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -52,6 +52,7 @@ struct perf_buffer { void (*free_aux)(void *); refcount_t aux_refcount; int aux_in_sampling; + int aux_in_pause_resume; void **aux_pages; void *aux_priv; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2ec796e2f055..a76ddc5fc982 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -26,6 +26,9 @@ #include <linux/task_work.h> #include <linux/shmem_fs.h> #include <linux/khugepaged.h> +#include <linux/rcupdate_trace.h> +#include <linux/workqueue.h> +#include <linux/srcu.h> #include <linux/uprobes.h> @@ -42,8 +45,6 @@ static struct rb_root uprobes_tree = RB_ROOT; static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */ static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock); -DEFINE_STATIC_SRCU(uprobes_srcu); - #define UPROBES_HASH_SZ 13 /* serialize uprobe->pending_list */ static struct mutex 
uprobes_mmap_mutex[UPROBES_HASH_SZ]; @@ -51,6 +52,9 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem); +/* Covers return_instance's uprobe lifetime. */ +DEFINE_STATIC_SRCU(uretprobes_srcu); + /* Have a copy of original instruction */ #define UPROBE_COPY_INSN 0 @@ -62,10 +66,13 @@ struct uprobe { struct list_head pending_list; struct list_head consumers; struct inode *inode; /* Also hold a ref to inode */ - struct rcu_head rcu; + union { + struct rcu_head rcu; + struct work_struct work; + }; loff_t offset; loff_t ref_ctr_offset; - unsigned long flags; + unsigned long flags; /* "unsigned long" so bitops work */ /* * The generic code assumes that it has two members of unknown type @@ -100,7 +107,6 @@ static LIST_HEAD(delayed_uprobe_list); */ struct xol_area { wait_queue_head_t wq; /* if all slots are busy */ - atomic_t slot_count; /* number of in-use slots */ unsigned long *bitmap; /* 0 = free slot */ struct page *page; @@ -620,17 +626,23 @@ static inline bool uprobe_is_active(struct uprobe *uprobe) return !RB_EMPTY_NODE(&uprobe->rb_node); } -static void uprobe_free_rcu(struct rcu_head *rcu) +static void uprobe_free_rcu_tasks_trace(struct rcu_head *rcu) { struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu); kfree(uprobe); } -static void put_uprobe(struct uprobe *uprobe) +static void uprobe_free_srcu(struct rcu_head *rcu) { - if (!refcount_dec_and_test(&uprobe->ref)) - return; + struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu); + + call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu_tasks_trace); +} + +static void uprobe_free_deferred(struct work_struct *work) +{ + struct uprobe *uprobe = container_of(work, struct uprobe, work); write_lock(&uprobes_treelock); @@ -651,7 +663,162 @@ static void put_uprobe(struct uprobe *uprobe) delayed_uprobe_remove(uprobe, NULL); mutex_unlock(&delayed_uprobe_lock); - call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu); + /* start srcu -> rcu_tasks_trace -> 
kfree chain */ + call_srcu(&uretprobes_srcu, &uprobe->rcu, uprobe_free_srcu); +} + +static void put_uprobe(struct uprobe *uprobe) +{ + if (!refcount_dec_and_test(&uprobe->ref)) + return; + + INIT_WORK(&uprobe->work, uprobe_free_deferred); + schedule_work(&uprobe->work); +} + +/* Initialize hprobe as SRCU-protected "leased" uprobe */ +static void hprobe_init_leased(struct hprobe *hprobe, struct uprobe *uprobe, int srcu_idx) +{ + WARN_ON(!uprobe); + hprobe->state = HPROBE_LEASED; + hprobe->uprobe = uprobe; + hprobe->srcu_idx = srcu_idx; +} + +/* Initialize hprobe as refcounted ("stable") uprobe (uprobe can be NULL). */ +static void hprobe_init_stable(struct hprobe *hprobe, struct uprobe *uprobe) +{ + hprobe->state = uprobe ? HPROBE_STABLE : HPROBE_GONE; + hprobe->uprobe = uprobe; + hprobe->srcu_idx = -1; +} + +/* + * hprobe_consume() fetches hprobe's underlying uprobe and detects whether + * uprobe is SRCU protected or is refcounted. hprobe_consume() can be + * used only once for a given hprobe. + * + * Caller has to call hprobe_finalize() and pass previous hprobe_state, so + * that hprobe_finalize() can perform SRCU unlock or put uprobe, whichever + * is appropriate. + */ +static inline struct uprobe *hprobe_consume(struct hprobe *hprobe, enum hprobe_state *hstate) +{ + *hstate = xchg(&hprobe->state, HPROBE_CONSUMED); + switch (*hstate) { + case HPROBE_LEASED: + case HPROBE_STABLE: + return hprobe->uprobe; + case HPROBE_GONE: /* uprobe is NULL, no SRCU */ + case HPROBE_CONSUMED: /* uprobe was finalized already, do nothing */ + return NULL; + default: + WARN(1, "hprobe invalid state %d", *hstate); + return NULL; + } +} + +/* + * Reset hprobe state and, if hprobe was LEASED, release SRCU lock. + * hprobe_finalize() can only be used from current context after + * hprobe_consume() call (which determines uprobe and hstate value). 
+ */ +static void hprobe_finalize(struct hprobe *hprobe, enum hprobe_state hstate) +{ + switch (hstate) { + case HPROBE_LEASED: + __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx); + break; + case HPROBE_STABLE: + put_uprobe(hprobe->uprobe); + break; + case HPROBE_GONE: + case HPROBE_CONSUMED: + break; + default: + WARN(1, "hprobe invalid state %d", hstate); + break; + } +} + +/* + * Attempt to switch (atomically) uprobe from being SRCU protected (LEASED) + * to refcounted (STABLE) state. Competes with hprobe_consume(); only one of + * them can win the race to perform SRCU unlocking. Whoever wins must perform + * SRCU unlock. + * + * Returns underlying valid uprobe or NULL, if there was no underlying uprobe + * to begin with or we failed to bump its refcount and it's going away. + * + * Returned non-NULL uprobe can be still safely used within an ongoing SRCU + * locked region. If `get` is true, it's guaranteed that non-NULL uprobe has + * an extra refcount for caller to assume and use. Otherwise, it's not + * guaranteed that returned uprobe has a positive refcount, so caller has to + * attempt try_get_uprobe(), if it needs to preserve uprobe beyond current + * SRCU lock region. See dup_utask(). + */ +static struct uprobe *hprobe_expire(struct hprobe *hprobe, bool get) +{ + enum hprobe_state hstate; + + /* + * return_instance's hprobe is protected by RCU. + * Underlying uprobe is itself protected from reuse by SRCU. + */ + lockdep_assert(rcu_read_lock_held() && srcu_read_lock_held(&uretprobes_srcu)); + + hstate = READ_ONCE(hprobe->state); + switch (hstate) { + case HPROBE_STABLE: + /* uprobe has positive refcount, bump refcount, if necessary */ + return get ? 
get_uprobe(hprobe->uprobe) : hprobe->uprobe; + case HPROBE_GONE: + /* + * SRCU was unlocked earlier and we didn't manage to take + * uprobe refcnt, so it's effectively NULL + */ + return NULL; + case HPROBE_CONSUMED: + /* + * uprobe was consumed, so it's effectively NULL as far as + * uretprobe processing logic is concerned + */ + return NULL; + case HPROBE_LEASED: { + struct uprobe *uprobe = try_get_uprobe(hprobe->uprobe); + /* + * Try to switch hprobe state, guarding against + * hprobe_consume() or another hprobe_expire() racing with us. + * Note, if we failed to get uprobe refcount, we use special + * HPROBE_GONE state to signal that hprobe->uprobe shouldn't + * be used as it will be freed after SRCU is unlocked. + */ + if (try_cmpxchg(&hprobe->state, &hstate, uprobe ? HPROBE_STABLE : HPROBE_GONE)) { + /* We won the race, we are the ones to unlock SRCU */ + __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx); + return get ? get_uprobe(uprobe) : uprobe; + } + + /* + * We lost the race, undo refcount bump (if it ever happened), + * unless caller would like an extra refcount anyways. + */ + if (uprobe && !get) + put_uprobe(uprobe); + /* + * Even if hprobe_consume() or another hprobe_expire() wins + * the state update race and unlocks SRCU from under us, we + * still have a guarantee that underlying uprobe won't be + * freed due to ongoing caller's SRCU lock region, so we can + * return it regardless. Also, if `get` was true, we also have + * an extra ref for the caller to own. This is used in dup_utask(). 
+ */ + return uprobe; + } + default: + WARN(1, "unknown hprobe state %d", hstate); + return NULL; + } } static __always_inline @@ -706,7 +873,7 @@ static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset) struct rb_node *node; unsigned int seq; - lockdep_assert(srcu_read_lock_held(&uprobes_srcu)); + lockdep_assert(rcu_read_lock_trace_held()); do { seq = read_seqcount_begin(&uprobes_seqcount); @@ -825,8 +992,11 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) { + static atomic64_t id; + down_write(&uprobe->consumer_rwsem); list_add_rcu(&uc->cons_node, &uprobe->consumers); + uc->id = (__u64) atomic64_inc_return(&id); up_write(&uprobe->consumer_rwsem); } @@ -934,8 +1104,7 @@ static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm) bool ret = false; down_read(&uprobe->consumer_rwsem); - list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node, - srcu_read_lock_held(&uprobes_srcu)) { + list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { ret = consumer_filter(uc, mm); if (ret) break; @@ -1156,7 +1325,8 @@ void uprobe_unregister_sync(void) * unlucky enough caller can free consumer's memory and cause * handler_chain() or handle_uretprobe_chain() to do an use-after-free. 
*/ - synchronize_srcu(&uprobes_srcu); + synchronize_rcu_tasks_trace(); + synchronize_srcu(&uretprobes_srcu); } EXPORT_SYMBOL_GPL(uprobe_unregister_sync); @@ -1240,19 +1410,18 @@ EXPORT_SYMBOL_GPL(uprobe_register); int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add) { struct uprobe_consumer *con; - int ret = -ENOENT, srcu_idx; + int ret = -ENOENT; down_write(&uprobe->register_rwsem); - srcu_idx = srcu_read_lock(&uprobes_srcu); - list_for_each_entry_srcu(con, &uprobe->consumers, cons_node, - srcu_read_lock_held(&uprobes_srcu)) { + rcu_read_lock_trace(); + list_for_each_entry_rcu(con, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { if (con == uc) { ret = register_for_each_vma(uprobe, add ? uc : NULL); break; } } - srcu_read_unlock(&uprobes_srcu, srcu_idx); + rcu_read_unlock_trace(); up_write(&uprobe->register_rwsem); @@ -1475,9 +1644,15 @@ static vm_fault_t xol_fault(const struct vm_special_mapping *sm, return 0; } +static int xol_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) +{ + return -EPERM; +} + static const struct vm_special_mapping xol_mapping = { .name = "[uprobes]", .fault = xol_fault, + .mremap = xol_mremap, }; /* Slot allocation for XOL */ @@ -1545,7 +1720,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) if (!area->bitmap) goto free_area; - area->page = alloc_page(GFP_HIGHUSER); + area->page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); if (!area->page) goto free_bitmap; @@ -1553,7 +1728,6 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) init_waitqueue_head(&area->wq); /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); - atomic_set(&area->slot_count, 1); insns = arch_uprobe_trampoline(&insns_size); arch_uprobe_copy_ixol(area->page, 0, insns, insns_size); @@ -1626,92 +1800,57 @@ void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) } } -/* - * - search for a free slot. 
- */ -static unsigned long xol_take_insn_slot(struct xol_area *area) +static unsigned long xol_get_slot_nr(struct xol_area *area) { - unsigned long slot_addr; - int slot_nr; - - do { - slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE); - if (slot_nr < UINSNS_PER_PAGE) { - if (!test_and_set_bit(slot_nr, area->bitmap)) - break; + unsigned long slot_nr; - slot_nr = UINSNS_PER_PAGE; - continue; - } - wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE)); - } while (slot_nr >= UINSNS_PER_PAGE); - - slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES); - atomic_inc(&area->slot_count); + slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE); + if (slot_nr < UINSNS_PER_PAGE) { + if (!test_and_set_bit(slot_nr, area->bitmap)) + return slot_nr; + } - return slot_addr; + return UINSNS_PER_PAGE; } /* * xol_get_insn_slot - allocate a slot for xol. - * Returns the allocated slot address or 0. */ -static unsigned long xol_get_insn_slot(struct uprobe *uprobe) +static bool xol_get_insn_slot(struct uprobe *uprobe, struct uprobe_task *utask) { - struct xol_area *area; - unsigned long xol_vaddr; + struct xol_area *area = get_xol_area(); + unsigned long slot_nr; - area = get_xol_area(); if (!area) - return 0; + return false; - xol_vaddr = xol_take_insn_slot(area); - if (unlikely(!xol_vaddr)) - return 0; + wait_event(area->wq, (slot_nr = xol_get_slot_nr(area)) < UINSNS_PER_PAGE); - arch_uprobe_copy_ixol(area->page, xol_vaddr, + utask->xol_vaddr = area->vaddr + slot_nr * UPROBE_XOL_SLOT_BYTES; + arch_uprobe_copy_ixol(area->page, utask->xol_vaddr, &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); - - return xol_vaddr; + return true; } /* - * xol_free_insn_slot - If slot was earlier allocated by - * @xol_get_insn_slot(), make the slot available for - * subsequent requests. 
+ * xol_free_insn_slot - free the slot allocated by xol_get_insn_slot() */ -static void xol_free_insn_slot(struct task_struct *tsk) +static void xol_free_insn_slot(struct uprobe_task *utask) { - struct xol_area *area; - unsigned long vma_end; - unsigned long slot_addr; - - if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask) - return; + struct xol_area *area = current->mm->uprobes_state.xol_area; + unsigned long offset = utask->xol_vaddr - area->vaddr; + unsigned int slot_nr; - slot_addr = tsk->utask->xol_vaddr; - if (unlikely(!slot_addr)) + utask->xol_vaddr = 0; + /* xol_vaddr must fit into [area->vaddr, area->vaddr + PAGE_SIZE) */ + if (WARN_ON_ONCE(offset >= PAGE_SIZE)) return; - area = tsk->mm->uprobes_state.xol_area; - vma_end = area->vaddr + PAGE_SIZE; - if (area->vaddr <= slot_addr && slot_addr < vma_end) { - unsigned long offset; - int slot_nr; - - offset = slot_addr - area->vaddr; - slot_nr = offset / UPROBE_XOL_SLOT_BYTES; - if (slot_nr >= UINSNS_PER_PAGE) - return; - - clear_bit(slot_nr, area->bitmap); - atomic_dec(&area->slot_count); - smp_mb__after_atomic(); /* pairs with prepare_to_wait() */ - if (waitqueue_active(&area->wq)) - wake_up(&area->wq); - - tsk->utask->xol_vaddr = 0; - } + slot_nr = offset / UPROBE_XOL_SLOT_BYTES; + clear_bit(slot_nr, area->bitmap); + smp_mb__after_atomic(); /* pairs with prepare_to_wait() */ + if (waitqueue_active(&area->wq)) + wake_up(&area->wq); } void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, @@ -1750,11 +1889,18 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs) return instruction_pointer(regs); } -static struct return_instance *free_ret_instance(struct return_instance *ri) +static struct return_instance *free_ret_instance(struct return_instance *ri, bool cleanup_hprobe) { struct return_instance *next = ri->next; - put_uprobe(ri->uprobe); - kfree(ri); + + if (cleanup_hprobe) { + enum hprobe_state hstate; + + (void)hprobe_consume(&ri->hprobe, &hstate); + 
hprobe_finalize(&ri->hprobe, hstate); + } + + kfree_rcu(ri, rcu); return next; } @@ -1770,18 +1916,50 @@ void uprobe_free_utask(struct task_struct *t) if (!utask) return; - if (utask->active_uprobe) - put_uprobe(utask->active_uprobe); + WARN_ON_ONCE(utask->active_uprobe || utask->xol_vaddr); + + timer_delete_sync(&utask->ri_timer); ri = utask->return_instances; while (ri) - ri = free_ret_instance(ri); + ri = free_ret_instance(ri, true /* cleanup_hprobe */); - xol_free_insn_slot(t); kfree(utask); t->utask = NULL; } +#define RI_TIMER_PERIOD (HZ / 10) /* 100 ms */ + +#define for_each_ret_instance_rcu(pos, head) \ + for (pos = rcu_dereference_raw(head); pos; pos = rcu_dereference_raw(pos->next)) + +static void ri_timer(struct timer_list *timer) +{ + struct uprobe_task *utask = container_of(timer, struct uprobe_task, ri_timer); + struct return_instance *ri; + + /* SRCU protects uprobe from reuse for the cmpxchg() inside hprobe_expire(). */ + guard(srcu)(&uretprobes_srcu); + /* RCU protects return_instance from freeing. */ + guard(rcu)(); + + for_each_ret_instance_rcu(ri, utask->return_instances) + hprobe_expire(&ri->hprobe, false); +} + +static struct uprobe_task *alloc_utask(void) +{ + struct uprobe_task *utask; + + utask = kzalloc(sizeof(*utask), GFP_KERNEL); + if (!utask) + return NULL; + + timer_setup(&utask->ri_timer, ri_timer, 0); + + return utask; +} + /* * Allocate a uprobe_task object for the task if necessary. * Called when the thread hits a breakpoint. 
@@ -1793,38 +1971,73 @@ void uprobe_free_utask(struct task_struct *t) static struct uprobe_task *get_utask(void) { if (!current->utask) - current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL); + current->utask = alloc_utask(); return current->utask; } +static size_t ri_size(int consumers_cnt) +{ + struct return_instance *ri; + + return sizeof(*ri) + sizeof(ri->consumers[0]) * consumers_cnt; +} + +#define DEF_CNT 4 + +static struct return_instance *alloc_return_instance(void) +{ + struct return_instance *ri; + + ri = kzalloc(ri_size(DEF_CNT), GFP_KERNEL); + if (!ri) + return ZERO_SIZE_PTR; + + ri->consumers_cnt = DEF_CNT; + return ri; +} + +static struct return_instance *dup_return_instance(struct return_instance *old) +{ + size_t size = ri_size(old->consumers_cnt); + + return kmemdup(old, size, GFP_KERNEL); +} + static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) { struct uprobe_task *n_utask; struct return_instance **p, *o, *n; + struct uprobe *uprobe; - n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL); + n_utask = alloc_utask(); if (!n_utask) return -ENOMEM; t->utask = n_utask; + /* protect uprobes from freeing, we'll need try_get_uprobe() them */ + guard(srcu)(&uretprobes_srcu); + p = &n_utask->return_instances; for (o = o_utask->return_instances; o; o = o->next) { - n = kmalloc(sizeof(struct return_instance), GFP_KERNEL); + n = dup_return_instance(o); if (!n) return -ENOMEM; - *n = *o; + /* if uprobe is non-NULL, we'll have an extra refcount for uprobe */ + uprobe = hprobe_expire(&o->hprobe, true); + /* - * uprobe's refcnt has to be positive at this point, kept by - * utask->return_instances items; return_instances can't be - * removed right now, as task is blocked due to duping; so - * get_uprobe() is safe to use here. + * New utask will have stable properly refcounted uprobe or + * NULL. 
Even if we failed to get refcounted uprobe, we still + * need to preserve full set of return_instances for proper + * uretprobe handling and nesting in forked task. */ - get_uprobe(n->uprobe); - n->next = NULL; + hprobe_init_stable(&n->hprobe, uprobe); - *p = n; + n->next = NULL; + rcu_assign_pointer(*p, n); p = &n->next; + n_utask->depth++; } @@ -1900,45 +2113,34 @@ static void cleanup_return_instances(struct uprobe_task *utask, bool chained, enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL; while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { - ri = free_ret_instance(ri); + ri = free_ret_instance(ri, true /* cleanup_hprobe */); utask->depth--; } - utask->return_instances = ri; + rcu_assign_pointer(utask->return_instances, ri); } -static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) +static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs, + struct return_instance *ri) { - struct return_instance *ri; - struct uprobe_task *utask; + struct uprobe_task *utask = current->utask; unsigned long orig_ret_vaddr, trampoline_vaddr; bool chained; + int srcu_idx; if (!get_xol_area()) - return; - - utask = get_utask(); - if (!utask) - return; + goto free; if (utask->depth >= MAX_URETPROBE_DEPTH) { printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to" " nestedness limit pid/tgid=%d/%d\n", current->pid, current->tgid); - return; + goto free; } - /* we need to bump refcount to store uprobe in utask */ - if (!try_get_uprobe(uprobe)) - return; - - ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL); - if (!ri) - goto fail; - trampoline_vaddr = uprobe_get_trampoline_vaddr(); orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); if (orig_ret_vaddr == -1) - goto fail; + goto free; /* drop the entries invalidated by longjmp() */ chained = (orig_ret_vaddr == trampoline_vaddr); @@ -1956,53 +2158,51 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) * attack from 
user-space. */ uprobe_warn(current, "handle tail call"); - goto fail; + goto free; } orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; } - ri->uprobe = uprobe; + + /* __srcu_read_lock() because SRCU lock survives switch to user space */ + srcu_idx = __srcu_read_lock(&uretprobes_srcu); + ri->func = instruction_pointer(regs); ri->stack = user_stack_pointer(regs); ri->orig_ret_vaddr = orig_ret_vaddr; ri->chained = chained; utask->depth++; + + hprobe_init_leased(&ri->hprobe, uprobe, srcu_idx); ri->next = utask->return_instances; - utask->return_instances = ri; + rcu_assign_pointer(utask->return_instances, ri); + + mod_timer(&utask->ri_timer, jiffies + RI_TIMER_PERIOD); return; -fail: +free: kfree(ri); - put_uprobe(uprobe); } /* Prepare to single-step probed instruction out of line. */ static int pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) { - struct uprobe_task *utask; - unsigned long xol_vaddr; + struct uprobe_task *utask = current->utask; int err; - utask = get_utask(); - if (!utask) - return -ENOMEM; - if (!try_get_uprobe(uprobe)) return -EINVAL; - xol_vaddr = xol_get_insn_slot(uprobe); - if (!xol_vaddr) { + if (!xol_get_insn_slot(uprobe, utask)) { err = -ENOMEM; goto err_out; } - utask->xol_vaddr = xol_vaddr; utask->vaddr = bp_vaddr; - err = arch_uprobe_pre_xol(&uprobe->arch, regs); if (unlikely(err)) { - xol_free_insn_slot(current); + xol_free_insn_slot(utask); goto err_out; } @@ -2125,35 +2325,90 @@ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb return uprobe; } +static struct return_instance* +push_consumer(struct return_instance *ri, int idx, __u64 id, __u64 cookie) +{ + if (unlikely(ri == ZERO_SIZE_PTR)) + return ri; + + if (unlikely(idx >= ri->consumers_cnt)) { + struct return_instance *old_ri = ri; + + ri->consumers_cnt += DEF_CNT; + ri = krealloc(old_ri, ri_size(old_ri->consumers_cnt), GFP_KERNEL); + if (!ri) { + kfree(old_ri); + return ZERO_SIZE_PTR; + } + } + + 
ri->consumers[idx].id = id; + ri->consumers[idx].cookie = cookie; + return ri; +} + +static struct return_consumer * +return_consumer_find(struct return_instance *ri, int *iter, int id) +{ + struct return_consumer *ric; + int idx = *iter; + + for (ric = &ri->consumers[idx]; idx < ri->consumers_cnt; idx++, ric++) { + if (ric->id == id) { + *iter = idx + 1; + return ric; + } + } + return NULL; +} + +static bool ignore_ret_handler(int rc) +{ + return rc == UPROBE_HANDLER_REMOVE || rc == UPROBE_HANDLER_IGNORE; +} + static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) { struct uprobe_consumer *uc; - int remove = UPROBE_HANDLER_REMOVE; - bool need_prep = false; /* prepare return uprobe, when needed */ - bool has_consumers = false; + bool has_consumers = false, remove = true; + struct return_instance *ri = NULL; + int push_idx = 0; current->utask->auprobe = &uprobe->arch; - list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node, - srcu_read_lock_held(&uprobes_srcu)) { + list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { + bool session = uc->handler && uc->ret_handler; + __u64 cookie = 0; int rc = 0; if (uc->handler) { - rc = uc->handler(uc, regs); - WARN(rc & ~UPROBE_HANDLER_MASK, + rc = uc->handler(uc, regs, &cookie); + WARN(rc < 0 || rc > 2, "bad rc=0x%x from %ps()\n", rc, uc->handler); } - if (uc->ret_handler) - need_prep = true; - - remove &= rc; + remove &= rc == UPROBE_HANDLER_REMOVE; has_consumers = true; + + if (!uc->ret_handler || ignore_ret_handler(rc)) + continue; + + if (!ri) + ri = alloc_return_instance(); + + if (session) + ri = push_consumer(ri, push_idx++, uc->id, cookie); } current->utask->auprobe = NULL; - if (need_prep && !remove) - prepare_uretprobe(uprobe, regs); /* put bp at return */ + if (!ZERO_OR_NULL_PTR(ri)) { + /* + * The push_idx value has the final number of return consumers, + * and ri->consumers_cnt has number of allocated consumers. 
+ */ + ri->consumers_cnt = push_idx; + prepare_uretprobe(uprobe, regs, ri); + } if (remove && has_consumers) { down_read(&uprobe->register_rwsem); @@ -2169,19 +2424,27 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) } static void -handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) +handle_uretprobe_chain(struct return_instance *ri, struct uprobe *uprobe, struct pt_regs *regs) { - struct uprobe *uprobe = ri->uprobe; + struct return_consumer *ric; struct uprobe_consumer *uc; - int srcu_idx; + int ric_idx = 0; + + /* all consumers unsubscribed meanwhile */ + if (unlikely(!uprobe)) + return; - srcu_idx = srcu_read_lock(&uprobes_srcu); - list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node, - srcu_read_lock_held(&uprobes_srcu)) { - if (uc->ret_handler) - uc->ret_handler(uc, ri->func, regs); + rcu_read_lock_trace(); + list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { + bool session = uc->handler && uc->ret_handler; + + if (uc->ret_handler) { + ric = return_consumer_find(ri, &ric_idx, uc->id); + if (!session || ric) + uc->ret_handler(uc, ri->func, regs, ric ? 
&ric->cookie : NULL); + } } - srcu_read_unlock(&uprobes_srcu, srcu_idx); + rcu_read_unlock_trace(); } static struct return_instance *find_next_ret_chain(struct return_instance *ri) @@ -2200,6 +2463,8 @@ void uprobe_handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; struct return_instance *ri, *next; + struct uprobe *uprobe; + enum hprobe_state hstate; bool valid; utask = current->utask; @@ -2230,21 +2495,24 @@ void uprobe_handle_trampoline(struct pt_regs *regs) * trampoline addresses on the stack are replaced with correct * original return addresses */ - utask->return_instances = ri->next; + rcu_assign_pointer(utask->return_instances, ri->next); + + uprobe = hprobe_consume(&ri->hprobe, &hstate); if (valid) - handle_uretprobe_chain(ri, regs); - ri = free_ret_instance(ri); + handle_uretprobe_chain(ri, uprobe, regs); + hprobe_finalize(&ri->hprobe, hstate); + + /* We already took care of hprobe, no need to waste more time on that. */ + ri = free_ret_instance(ri, false /* !cleanup_hprobe */); utask->depth--; } while (ri != next); } while (!valid); - utask->return_instances = ri; return; - sigill: +sigill: uprobe_warn(current, "handle uretprobe, sending SIGILL."); force_sig(SIGILL); - } bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) @@ -2266,13 +2534,13 @@ static void handle_swbp(struct pt_regs *regs) { struct uprobe *uprobe; unsigned long bp_vaddr; - int is_swbp, srcu_idx; + int is_swbp; bp_vaddr = uprobe_get_swbp_addr(regs); if (bp_vaddr == uprobe_get_trampoline_vaddr()) return uprobe_handle_trampoline(regs); - srcu_idx = srcu_read_lock(&uprobes_srcu); + rcu_read_lock_trace(); uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); if (!uprobe) { @@ -2330,7 +2598,7 @@ static void handle_swbp(struct pt_regs *regs) out: /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ - srcu_read_unlock(&uprobes_srcu, srcu_idx); + rcu_read_unlock_trace(); } /* @@ -2353,7 +2621,7 @@ static void handle_singlestep(struct 
uprobe_task *utask, struct pt_regs *regs) put_uprobe(uprobe); utask->active_uprobe = NULL; utask->state = UTASK_RUNNING; - xol_free_insn_slot(current); + xol_free_insn_slot(utask); spin_lock_irq(&current->sighand->siglock); recalc_sigpending(); /* see uprobe_deny_signal() */ diff --git a/kernel/exit.c b/kernel/exit.c index 619f0014c33b..1dcddfe537ee 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -25,7 +25,6 @@ #include <linux/acct.h> #include <linux/tsacct_kern.h> #include <linux/file.h> -#include <linux/fdtable.h> #include <linux/freezer.h> #include <linux/binfmts.h> #include <linux/nsproxy.h> diff --git a/kernel/fork.c b/kernel/fork.c index c2bd8367a850..e58d27c05788 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -105,6 +105,7 @@ #include <linux/rseq.h> #include <uapi/linux/pidfd.h> #include <linux/pidfs.h> +#include <linux/tick.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> @@ -653,11 +654,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, mm->exec_vm = oldmm->exec_vm; mm->stack_vm = oldmm->stack_vm; - retval = ksm_fork(mm, oldmm); - if (retval) - goto out; - khugepaged_fork(mm, oldmm); - /* Use __mt_dup() to efficiently build an identical maple tree. */ retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); if (unlikely(retval)) @@ -760,6 +756,8 @@ loop_out: vma_iter_free(&vmi); if (!retval) { mt_set_in_rcu(vmi.mas.tree); + ksm_fork(mm, oldmm); + khugepaged_fork(mm, oldmm); } else if (mpnt) { /* * The entire maple tree has already been duplicated. 
If the @@ -775,7 +773,10 @@ out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); mmap_write_unlock(oldmm); - dup_userfaultfd_complete(&uf); + if (!retval) + dup_userfaultfd_complete(&uf); + else + dup_userfaultfd_fail(&uf); fail_uprobe_end: uprobe_end_dup_mmap(); return retval; @@ -1184,7 +1185,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->active_memcg = NULL; #endif -#ifdef CONFIG_CPU_SUP_INTEL +#ifdef CONFIG_X86_BUS_LOCK_DETECT tsk->reported_split_lock = 0; #endif @@ -1298,7 +1299,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, if (init_new_context(p, mm)) goto fail_nocontext; - if (mm_alloc_cid(mm)) + if (mm_alloc_cid(mm, p)) goto fail_cid; if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, @@ -1756,33 +1757,30 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk, int no_files) { struct files_struct *oldf, *newf; - int error = 0; /* * A background process may not have any files ... 
*/ oldf = current->files; if (!oldf) - goto out; + return 0; if (no_files) { tsk->files = NULL; - goto out; + return 0; } if (clone_flags & CLONE_FILES) { atomic_inc(&oldf->count); - goto out; + return 0; } - newf = dup_fd(oldf, NR_OPEN_MAX, &error); - if (!newf) - goto out; + newf = dup_fd(oldf, NULL); + if (IS_ERR(newf)) + return PTR_ERR(newf); tsk->files = newf; - error = 0; -out: - return error; + return 0; } static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) @@ -2296,6 +2294,7 @@ __latent_entropy struct task_struct *copy_process( acct_clear_integrals(p); posix_cputimers_init(&p->posix_cputimers); + tick_dep_init_task(p); p->io_context = NULL; audit_set_context(p, NULL); @@ -3239,17 +3238,16 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) /* * Unshare file descriptor table if it is being shared */ -int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, - struct files_struct **new_fdp) +static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) { struct files_struct *fd = current->files; - int error = 0; if ((unshare_flags & CLONE_FILES) && (fd && atomic_read(&fd->count) > 1)) { - *new_fdp = dup_fd(fd, max_fds, &error); - if (!*new_fdp) - return error; + fd = dup_fd(fd, NULL); + if (IS_ERR(fd)) + return PTR_ERR(fd); + *new_fdp = fd; } return 0; @@ -3307,7 +3305,7 @@ int ksys_unshare(unsigned long unshare_flags) err = unshare_fs(unshare_flags, &new_fs); if (err) goto bad_unshare_out; - err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd); + err = unshare_fd(unshare_flags, &new_fd); if (err) goto bad_unshare_cleanup_fs; err = unshare_userns(unshare_flags, &new_cred); @@ -3399,7 +3397,7 @@ int unshare_files(void) struct files_struct *old, *copy = NULL; int error; - error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy); + error = unshare_fd(CLONE_FILES, &copy); if (error || !copy) return error; diff --git a/kernel/freezer.c b/kernel/freezer.c index 44bbd7dbd2c8..8d530d0949ff 100644 
--- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -109,7 +109,12 @@ static int __set_task_frozen(struct task_struct *p, void *arg) { unsigned int state = READ_ONCE(p->__state); - if (p->on_rq) + /* + * Allow freezing the sched_delayed tasks; they will not execute until + * ttwu() fixes them up, so it is safe to swap their state now, instead + * of waiting for them to get fully dequeued. + */ + if (task_is_runnable(p)) return 0; if (p != current && task_curr(p)) diff --git a/kernel/futex/core.c b/kernel/futex/core.c index fb7214c7a36f..326bfe6549d7 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -181,12 +181,12 @@ static u64 get_inode_sequence_number(struct inode *inode) return old; for (;;) { - u64 new = atomic64_add_return(1, &i_seq); + u64 new = atomic64_inc_return(&i_seq); if (WARN_ON_ONCE(!new)) continue; - old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); - if (old) + old = 0; + if (!atomic64_try_cmpxchg_relaxed(&inode->i_sequence, &old, new)) return old; return new; } diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c index 5722467f2737..d62cca5ed8f4 100644 --- a/kernel/futex/pi.c +++ b/kernel/futex/pi.c @@ -922,6 +922,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl struct rt_mutex_waiter rt_waiter; struct futex_hash_bucket *hb; struct futex_q q = futex_q_init; + DEFINE_WAKE_Q(wake_q); int res, ret; if (!IS_ENABLED(CONFIG_FUTEX_PI)) @@ -1018,8 +1019,11 @@ retry_private: * such that futex_unlock_pi() is guaranteed to observe the waiter when * it sees the futex_q::pi_state. 
*/ - ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); + ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q); + preempt_disable(); raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); + wake_up_q(&wake_q); + preempt_enable(); if (ret) { if (ret == 1) diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index b3e98668f4dd..eb16a58e0322 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -141,9 +141,8 @@ void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) { struct irq_devres match_data = { irq, dev_id }; - WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match, + WARN_ON(devres_release(dev, devm_irq_release, devm_irq_match, &match_data)); - free_irq(irq, dev_id); } EXPORT_SYMBOL(devm_free_irq); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 1dee88ba0ae4..0253e77fcd9a 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -15,6 +15,7 @@ #include <linux/maple_tree.h> #include <linux/irqdomain.h> #include <linux/sysfs.h> +#include <linux/string_choices.h> #include "internals.h" @@ -138,8 +139,30 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, desc_smp_init(desc, node, affinity); } -int nr_irqs = NR_IRQS; -EXPORT_SYMBOL_GPL(nr_irqs); +static unsigned int nr_irqs = NR_IRQS; + +/** + * irq_get_nr_irqs() - Number of interrupts supported by the system. + */ +unsigned int irq_get_nr_irqs(void) +{ + return nr_irqs; +} +EXPORT_SYMBOL_GPL(irq_get_nr_irqs); + +/** + * irq_set_nr_irqs() - Set the number of interrupts supported by the system. + * @nr: New number of interrupts. + * + * Return: @nr. 
+ */ +unsigned int irq_set_nr_irqs(unsigned int nr) +{ + nr_irqs = nr; + + return nr; +} +EXPORT_SYMBOL_GPL(irq_set_nr_irqs); static DEFINE_MUTEX(sparse_irq_lock); static struct maple_tree sparse_irqs = MTREE_INIT_EXT(sparse_irqs, @@ -298,8 +321,7 @@ static ssize_t wakeup_show(struct kobject *kobj, ssize_t ret = 0; raw_spin_lock_irq(&desc->lock); - ret = sprintf(buf, "%s\n", - irqd_is_wakeup_set(&desc->irq_data) ? "enabled" : "disabled"); + ret = sprintf(buf, "%s\n", str_enabled_disabled(irqd_is_wakeup_set(&desc->irq_data))); raw_spin_unlock_irq(&desc->lock); return ret; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e0bff21f30e0..ec6d8e72d980 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1225,7 +1225,7 @@ int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, virq = __irq_alloc_descs(virq, virq, cnt, node, THIS_MODULE, affinity); } else { - hint = hwirq % nr_irqs; + hint = hwirq % irq_get_nr_irqs(); if (hint == 0) hint++; virq = __irq_alloc_descs(-1, hint, cnt, node, THIS_MODULE, diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 3a24d6b5f559..396a067a8a56 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -718,7 +718,7 @@ static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq, ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg); if (ret < 0) { if (ops->msi_free) { - for (i--; i > 0; i--) + for (i--; i >= 0; i--) ops->msi_free(domain, info, virq + i); } irq_domain_free_irqs_top(domain, virq, nr_irqs); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 9081ada81c3d..f36c33bd2da4 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -457,11 +457,12 @@ int __weak arch_show_interrupts(struct seq_file *p, int prec) } #ifndef ACTUAL_NR_IRQS -# define ACTUAL_NR_IRQS nr_irqs +# define ACTUAL_NR_IRQS irq_get_nr_irqs() #endif int show_interrupts(struct seq_file *p, void *v) { + const unsigned int nr_irqs = irq_get_nr_irqs(); static int prec; int i = *(loff_t *) v, 
j; @@ -494,9 +495,12 @@ int show_interrupts(struct seq_file *p, void *v) if (!desc->action || irq_desc_is_chained(desc) || !desc->kstat_irqs) goto outsparse; - seq_printf(p, "%*d: ", prec, i); - for_each_online_cpu(j) - seq_printf(p, "%10u ", desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, j) : 0); + seq_printf(p, "%*d:", prec, i); + for_each_online_cpu(j) { + unsigned int cnt = desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, j) : 0; + + seq_put_decimal_ull_width(p, " ", cnt, 10); + } raw_spin_lock_irqsave(&desc->lock, flags); if (desc->irq_data.chip) { diff --git a/kernel/kcmp.c b/kernel/kcmp.c index b0639f21041f..2c596851f8a9 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -63,9 +63,7 @@ get_file_raw_ptr(struct task_struct *task, unsigned int idx) { struct file *file; - rcu_read_lock(); - file = task_lookup_fdget_rcu(task, idx); - rcu_read_unlock(); + file = fget_task(task, idx); if (file) fput(file); diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index 53b21ae30e00..2af39ba5b70b 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -46,14 +46,8 @@ static struct { int used; /* number of elements used */ bool sorted; /* if elements are sorted */ bool whitelist; /* if list is a blacklist or whitelist */ -} report_filterlist = { - .addrs = NULL, - .size = 8, /* small initial size */ - .used = 0, - .sorted = false, - .whitelist = false, /* default is blacklist */ -}; -static DEFINE_SPINLOCK(report_filterlist_lock); +} report_filterlist; +static DEFINE_RAW_SPINLOCK(report_filterlist_lock); /* * The microbenchmark allows benchmarking KCSAN core runtime only. 
To run @@ -110,7 +104,7 @@ bool kcsan_skip_report_debugfs(unsigned long func_addr) return false; func_addr -= offset; /* Get function start */ - spin_lock_irqsave(&report_filterlist_lock, flags); + raw_spin_lock_irqsave(&report_filterlist_lock, flags); if (report_filterlist.used == 0) goto out; @@ -127,7 +121,7 @@ bool kcsan_skip_report_debugfs(unsigned long func_addr) ret = !ret; out: - spin_unlock_irqrestore(&report_filterlist_lock, flags); + raw_spin_unlock_irqrestore(&report_filterlist_lock, flags); return ret; } @@ -135,9 +129,9 @@ static void set_report_filterlist_whitelist(bool whitelist) { unsigned long flags; - spin_lock_irqsave(&report_filterlist_lock, flags); + raw_spin_lock_irqsave(&report_filterlist_lock, flags); report_filterlist.whitelist = whitelist; - spin_unlock_irqrestore(&report_filterlist_lock, flags); + raw_spin_unlock_irqrestore(&report_filterlist_lock, flags); } /* Returns 0 on success, error-code otherwise. */ @@ -145,6 +139,9 @@ static ssize_t insert_report_filterlist(const char *func) { unsigned long flags; unsigned long addr = kallsyms_lookup_name(func); + unsigned long *delay_free = NULL; + unsigned long *new_addrs = NULL; + size_t new_size = 0; ssize_t ret = 0; if (!addr) { @@ -152,42 +149,42 @@ static ssize_t insert_report_filterlist(const char *func) return -ENOENT; } - spin_lock_irqsave(&report_filterlist_lock, flags); +retry_alloc: + /* + * Check if we need an allocation, and re-validate under the lock. Since + * the report_filterlist_lock is a raw, cannot allocate under the lock. 
+ */ + if (data_race(report_filterlist.used == report_filterlist.size)) { + new_size = (report_filterlist.size ?: 4) * 2; + delay_free = new_addrs = kmalloc_array(new_size, sizeof(unsigned long), GFP_KERNEL); + if (!new_addrs) + return -ENOMEM; + } - if (report_filterlist.addrs == NULL) { - /* initial allocation */ - report_filterlist.addrs = - kmalloc_array(report_filterlist.size, - sizeof(unsigned long), GFP_ATOMIC); - if (report_filterlist.addrs == NULL) { - ret = -ENOMEM; - goto out; - } - } else if (report_filterlist.used == report_filterlist.size) { - /* resize filterlist */ - size_t new_size = report_filterlist.size * 2; - unsigned long *new_addrs = - krealloc(report_filterlist.addrs, - new_size * sizeof(unsigned long), GFP_ATOMIC); - - if (new_addrs == NULL) { - /* leave filterlist itself untouched */ - ret = -ENOMEM; - goto out; + raw_spin_lock_irqsave(&report_filterlist_lock, flags); + if (report_filterlist.used == report_filterlist.size) { + /* Check we pre-allocated enough, and retry if not. */ + if (report_filterlist.used >= new_size) { + raw_spin_unlock_irqrestore(&report_filterlist_lock, flags); + kfree(new_addrs); /* kfree(NULL) is safe */ + delay_free = new_addrs = NULL; + goto retry_alloc; } + if (report_filterlist.used) + memcpy(new_addrs, report_filterlist.addrs, report_filterlist.used * sizeof(unsigned long)); + delay_free = report_filterlist.addrs; /* free the old list */ + report_filterlist.addrs = new_addrs; /* switch to the new list */ report_filterlist.size = new_size; - report_filterlist.addrs = new_addrs; } /* Note: deduplicating should be done in userspace. 
*/ - report_filterlist.addrs[report_filterlist.used++] = - kallsyms_lookup_name(func); + report_filterlist.addrs[report_filterlist.used++] = addr; report_filterlist.sorted = false; -out: - spin_unlock_irqrestore(&report_filterlist_lock, flags); + raw_spin_unlock_irqrestore(&report_filterlist_lock, flags); + kfree(delay_free); return ret; } @@ -204,13 +201,13 @@ static int show_info(struct seq_file *file, void *v) } /* show filter functions, and filter type */ - spin_lock_irqsave(&report_filterlist_lock, flags); + raw_spin_lock_irqsave(&report_filterlist_lock, flags); seq_printf(file, "\n%s functions: %s\n", report_filterlist.whitelist ? "whitelisted" : "blacklisted", report_filterlist.used == 0 ? "none" : ""); for (i = 0; i < report_filterlist.used; ++i) seq_printf(file, " %ps\n", (void *)report_filterlist.addrs[i]); - spin_unlock_irqrestore(&report_filterlist_lock, flags); + raw_spin_unlock_irqrestore(&report_filterlist_lock, flags); return 0; } diff --git a/kernel/kthread.c b/kernel/kthread.c index db4ceb0f503c..9bb36897b6c6 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -623,6 +623,8 @@ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_kthread(k); + if (!test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)) + return; /* * Newly created kthread was parked when the CPU was offline. * The binding was lost and we need to set it again. diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 536bd471557f..2d8ec0351ef9 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4586,6 +4586,30 @@ void lockdep_softirqs_off(unsigned long ip) debug_atomic_inc(redundant_softirqs_off); } +/** + * lockdep_cleanup_dead_cpu - Ensure CPU lockdep state is cleanly stopped + * + * @cpu: index of offlined CPU + * @idle: task pointer for offlined CPU's idle thread + * + * Invoked after the CPU is dead. Ensures that the tracing infrastructure + * is left in a suitable state for the CPU to be subsequently brought + * online again. 
+ */ +void lockdep_cleanup_dead_cpu(unsigned int cpu, struct task_struct *idle) +{ + if (unlikely(!debug_locks)) + return; + + if (unlikely(per_cpu(hardirqs_enabled, cpu))) { + pr_warn("CPU %u left hardirqs enabled!", cpu); + if (idle) + print_irqtrace_events(idle); + /* Clean it up for when the CPU comes online again. */ + per_cpu(hardirqs_enabled, cpu) = 0; + } +} + static int mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) { @@ -6576,17 +6600,17 @@ EXPORT_SYMBOL_GPL(lockdep_unregister_key); void __init lockdep_init(void) { - printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); + pr_info("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); - printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES); - printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH); - printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS); - printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE); - printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES); - printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS); - printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE); + pr_info("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES); + pr_info("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH); + pr_info("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS); + pr_info("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE); + pr_info("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES); + pr_info("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS); + pr_info("... 
CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE); - printk(" memory used by lock dependency info: %zu kB\n", + pr_info(" memory used by lock dependency info: %zu kB\n", (sizeof(lock_classes) + sizeof(lock_classes_in_use) + sizeof(classhash_table) + @@ -6604,12 +6628,12 @@ void __init lockdep_init(void) ); #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) - printk(" memory used for stack traces: %zu kB\n", + pr_info(" memory used for stack traces: %zu kB\n", (sizeof(stack_trace) + sizeof(stack_trace_hash)) / 1024 ); #endif - printk(" per task-struct memory footprint: %zu bytes\n", + pr_info(" per task-struct memory footprint: %zu bytes\n", sizeof(((struct task_struct *)NULL)->held_locks)); } diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index cbae8c0b89ab..3302e52f0c96 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -56,31 +56,6 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) } EXPORT_SYMBOL(__mutex_init); -/* - * @owner: contains: 'struct task_struct *' to the current lock owner, - * NULL means not owned. Since task_struct pointers are aligned at - * at least L1_CACHE_BYTES, we have low bits to store extra state. - * - * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup. - * Bit1 indicates unlock needs to hand the lock to the top-waiter - * Bit2 indicates handoff has been done and we're waiting for pickup. - */ -#define MUTEX_FLAG_WAITERS 0x01 -#define MUTEX_FLAG_HANDOFF 0x02 -#define MUTEX_FLAG_PICKUP 0x04 - -#define MUTEX_FLAGS 0x07 - -/* - * Internal helper function; C doesn't allow us to hide it :/ - * - * DO NOT USE (outside of mutex code). 
- */ -static inline struct task_struct *__mutex_owner(struct mutex *lock) -{ - return (struct task_struct *)(atomic_long_read(&lock->owner) & ~MUTEX_FLAGS); -} - static inline struct task_struct *__owner_task(unsigned long owner) { return (struct task_struct *)(owner & ~MUTEX_FLAGS); @@ -575,8 +550,10 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas struct lockdep_map *nest_lock, unsigned long ip, struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) { + DEFINE_WAKE_Q(wake_q); struct mutex_waiter waiter; struct ww_mutex *ww; + unsigned long flags; int ret; if (!use_ww_ctx) @@ -619,13 +596,13 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas return 0; } - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); /* * After waiting to acquire the wait_lock, try again. */ if (__mutex_trylock(lock)) { if (ww_ctx) - __ww_mutex_check_waiters(lock, ww_ctx); + __ww_mutex_check_waiters(lock, ww_ctx, &wake_q); goto skip_wait; } @@ -645,7 +622,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas * Add in stamp order, waking up waiters that must kill * themselves. 
*/ - ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx); + ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx, &wake_q); if (ret) goto err_early_kill; } @@ -680,7 +657,11 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas goto err; } - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + /* Make sure we do wakeups before calling schedule */ + wake_up_q(&wake_q); + wake_q_init(&wake_q); + schedule_preempt_disabled(); first = __mutex_waiter_is_first(lock, &waiter); @@ -701,9 +682,9 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas trace_contention_begin(lock, LCB_F_MUTEX); } - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); } - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); acquired: __set_current_state(TASK_RUNNING); @@ -714,7 +695,7 @@ acquired: */ if (!ww_ctx->is_wait_die && !__mutex_waiter_is_first(lock, &waiter)) - __ww_mutex_check_waiters(lock, ww_ctx); + __ww_mutex_check_waiters(lock, ww_ctx, &wake_q); } __mutex_remove_waiter(lock, &waiter); @@ -729,7 +710,8 @@ skip_wait: if (ww_ctx) ww_mutex_lock_acquired(ww, ww_ctx); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + wake_up_q(&wake_q); preempt_enable(); return 0; @@ -738,9 +720,10 @@ err: __mutex_remove_waiter(lock, &waiter); err_early_kill: trace_contention_end(lock, ret); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, ip); + wake_up_q(&wake_q); preempt_enable(); return ret; } @@ -908,6 +891,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne struct task_struct *next = NULL; DEFINE_WAKE_Q(wake_q); unsigned long owner; + unsigned long flags; mutex_release(&lock->dep_map, ip); @@ -934,7 +918,7 @@ static noinline void __sched 
__mutex_unlock_slowpath(struct mutex *lock, unsigne } } - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); debug_mutex_unlock(lock); if (!list_empty(&lock->wait_list)) { /* get the first entry from the wait-list: */ @@ -951,9 +935,10 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne if (owner & MUTEX_FLAG_HANDOFF) __mutex_handoff(lock, next); - raw_spin_unlock(&lock->wait_lock); - + preempt_disable(); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); wake_up_q(&wake_q); + preempt_enable(); } #ifndef CONFIG_DEBUG_LOCK_ALLOC diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 0b2a79c4013b..cbff35b9b7ae 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -20,6 +20,33 @@ struct mutex_waiter { #endif }; +/* + * @owner: contains: 'struct task_struct *' to the current lock owner, + * NULL means not owned. Since task_struct pointers are aligned at + * at least L1_CACHE_BYTES, we have low bits to store extra state. + * + * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup. + * Bit1 indicates unlock needs to hand the lock to the top-waiter + * Bit2 indicates handoff has been done and we're waiting for pickup. + */ +#define MUTEX_FLAG_WAITERS 0x01 +#define MUTEX_FLAG_HANDOFF 0x02 +#define MUTEX_FLAG_PICKUP 0x04 + +#define MUTEX_FLAGS 0x07 + +/* + * Internal helper function; C doesn't allow us to hide it :/ + * + * DO NOT USE (outside of mutex & scheduler code). 
+ */ +static inline struct task_struct *__mutex_owner(struct mutex *lock) +{ + if (!lock) + return NULL; + return (struct task_struct *)(atomic_long_read(&lock->owner) & ~MUTEX_FLAGS); +} + #ifdef CONFIG_DEBUG_MUTEXES extern void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter); diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index 75a6f6133866..b4233dc2c2b0 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -215,8 +215,7 @@ void osq_unlock(struct optimistic_spin_queue *lock) /* * Fast path for the uncontended case. */ - if (likely(atomic_cmpxchg_release(&lock->tail, curr, - OSQ_UNLOCKED_VAL) == curr)) + if (atomic_try_cmpxchg_release(&lock->tail, &curr, OSQ_UNLOCKED_VAL)) return; /* diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index ac2e22502741..dc1cb90e3644 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -38,13 +38,13 @@ #define PV_PREV_CHECK_MASK 0xff /* - * Queue node uses: vcpu_running & vcpu_halted. - * Queue head uses: vcpu_running & vcpu_hashed. + * Queue node uses: VCPU_RUNNING & VCPU_HALTED. + * Queue head uses: VCPU_RUNNING & VCPU_HASHED. 
*/ enum vcpu_state { - vcpu_running = 0, - vcpu_halted, /* Used only in pv_wait_node */ - vcpu_hashed, /* = pv_hash'ed + vcpu_halted */ + VCPU_RUNNING = 0, + VCPU_HALTED, /* Used only in pv_wait_node */ + VCPU_HASHED, /* = pv_hash'ed + VCPU_HALTED */ }; struct pv_node { @@ -266,7 +266,7 @@ pv_wait_early(struct pv_node *prev, int loop) if ((loop & PV_PREV_CHECK_MASK) != 0) return false; - return READ_ONCE(prev->state) != vcpu_running; + return READ_ONCE(prev->state) != VCPU_RUNNING; } /* @@ -279,7 +279,7 @@ static void pv_init_node(struct mcs_spinlock *node) BUILD_BUG_ON(sizeof(struct pv_node) > sizeof(struct qnode)); pn->cpu = smp_processor_id(); - pn->state = vcpu_running; + pn->state = VCPU_RUNNING; } /* @@ -308,26 +308,26 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) /* * Order pn->state vs pn->locked thusly: * - * [S] pn->state = vcpu_halted [S] next->locked = 1 + * [S] pn->state = VCPU_HALTED [S] next->locked = 1 * MB MB - * [L] pn->locked [RmW] pn->state = vcpu_hashed + * [L] pn->locked [RmW] pn->state = VCPU_HASHED * * Matches the cmpxchg() from pv_kick_node(). */ - smp_store_mb(pn->state, vcpu_halted); + smp_store_mb(pn->state, VCPU_HALTED); if (!READ_ONCE(node->locked)) { lockevent_inc(pv_wait_node); lockevent_cond_inc(pv_wait_early, wait_early); - pv_wait(&pn->state, vcpu_halted); + pv_wait(&pn->state, VCPU_HALTED); } /* - * If pv_kick_node() changed us to vcpu_hashed, retain that + * If pv_kick_node() changed us to VCPU_HASHED, retain that * value so that pv_wait_head_or_lock() knows to not also try * to hash this lock. 
*/ - cmpxchg(&pn->state, vcpu_halted, vcpu_running); + cmpxchg(&pn->state, VCPU_HALTED, VCPU_RUNNING); /* * If the locked flag is still not set after wakeup, it is a @@ -357,7 +357,7 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; - u8 old = vcpu_halted; + u8 old = VCPU_HALTED; /* * If the vCPU is indeed halted, advance its state to match that of * pv_wait_node(). If OTOH this fails, the vCPU was running and will @@ -374,7 +374,7 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) * subsequent writes. */ smp_mb__before_atomic(); - if (!try_cmpxchg_relaxed(&pn->state, &old, vcpu_hashed)) + if (!try_cmpxchg_relaxed(&pn->state, &old, VCPU_HASHED)) return; /* @@ -407,7 +407,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) * If pv_kick_node() already advanced our state, we don't need to * insert ourselves into the hash table anymore. */ - if (READ_ONCE(pn->state) == vcpu_hashed) + if (READ_ONCE(pn->state) == VCPU_HASHED) lp = (struct qspinlock **)1; /* @@ -420,7 +420,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) * Set correct vCPU state to be used by queue node wait-early * mechanism. 
*/ - WRITE_ONCE(pn->state, vcpu_running); + WRITE_ONCE(pn->state, VCPU_RUNNING); /* * Set the pending bit in the active lock spinning loop to @@ -460,7 +460,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) goto gotlock; } } - WRITE_ONCE(pn->state, vcpu_hashed); + WRITE_ONCE(pn->state, VCPU_HASHED); lockevent_inc(pv_wait_head); lockevent_cond_inc(pv_wait_again, waitcnt); pv_wait(&lock->locked, _Q_SLOW_VAL); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index ebebd0eec7f6..ac1365afcc4a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -34,13 +34,15 @@ static inline int __ww_mutex_add_waiter(struct rt_mutex_waiter *waiter, struct rt_mutex *lock, - struct ww_acquire_ctx *ww_ctx) + struct ww_acquire_ctx *ww_ctx, + struct wake_q_head *wake_q) { return 0; } static inline void __ww_mutex_check_waiters(struct rt_mutex *lock, - struct ww_acquire_ctx *ww_ctx) + struct ww_acquire_ctx *ww_ctx, + struct wake_q_head *wake_q) { } @@ -1201,7 +1203,8 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter, struct task_struct *task, struct ww_acquire_ctx *ww_ctx, - enum rtmutex_chainwalk chwalk) + enum rtmutex_chainwalk chwalk, + struct wake_q_head *wake_q) { struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex_waiter *top_waiter = waiter; @@ -1245,7 +1248,10 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, /* Check whether the waiter should back out immediately */ rtm = container_of(lock, struct rt_mutex, rtmutex); - res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx); + preempt_disable(); + res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx, wake_q); + wake_up_q(wake_q); + preempt_enable(); if (res) { raw_spin_lock(&task->pi_lock); rt_mutex_dequeue(lock, waiter); @@ -1601,6 +1607,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, unsigned int state, struct hrtimer_sleeper *timeout, struct rt_mutex_waiter 
*waiter) + __releases(&lock->wait_lock) __acquires(&lock->wait_lock) { struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex); struct task_struct *owner; @@ -1674,12 +1681,14 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock, * @state: The task state for sleeping * @chwalk: Indicator whether full or partial chainwalk is requested * @waiter: Initializer waiter for blocking + * @wake_q: The wake_q to wake tasks after we release the wait_lock */ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, struct ww_acquire_ctx *ww_ctx, unsigned int state, enum rtmutex_chainwalk chwalk, - struct rt_mutex_waiter *waiter) + struct rt_mutex_waiter *waiter, + struct wake_q_head *wake_q) { struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex); struct ww_mutex *ww = ww_container_of(rtm); @@ -1690,7 +1699,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock, current, NULL)) { if (build_ww_mutex() && ww_ctx) { - __ww_mutex_check_waiters(rtm, ww_ctx); + __ww_mutex_check_waiters(rtm, ww_ctx, wake_q); ww_mutex_lock_acquired(ww, ww_ctx); } return 0; @@ -1700,7 +1709,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, trace_contention_begin(lock, LCB_F_RT); - ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk); + ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk, wake_q); if (likely(!ret)) ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter); @@ -1708,7 +1717,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, /* acquired the lock */ if (build_ww_mutex() && ww_ctx) { if (!ww_ctx->is_wait_die) - __ww_mutex_check_waiters(rtm, ww_ctx); + __ww_mutex_check_waiters(rtm, ww_ctx, wake_q); ww_mutex_lock_acquired(ww, ww_ctx); } } else { @@ -1730,7 +1739,8 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, static inline int 
__rt_mutex_slowlock_locked(struct rt_mutex_base *lock, struct ww_acquire_ctx *ww_ctx, - unsigned int state) + unsigned int state, + struct wake_q_head *wake_q) { struct rt_mutex_waiter waiter; int ret; @@ -1739,7 +1749,7 @@ static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock, waiter.ww_ctx = ww_ctx; ret = __rt_mutex_slowlock(lock, ww_ctx, state, RT_MUTEX_MIN_CHAINWALK, - &waiter); + &waiter, wake_q); debug_rt_mutex_free_waiter(&waiter); return ret; @@ -1755,6 +1765,7 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, struct ww_acquire_ctx *ww_ctx, unsigned int state) { + DEFINE_WAKE_Q(wake_q); unsigned long flags; int ret; @@ -1776,8 +1787,11 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, * irqsave/restore variants. */ raw_spin_lock_irqsave(&lock->wait_lock, flags); - ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state); + ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state, &wake_q); + preempt_disable(); raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + wake_up_q(&wake_q); + preempt_enable(); rt_mutex_post_schedule(); return ret; @@ -1803,8 +1817,11 @@ static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock, /** * rtlock_slowlock_locked - Slow path lock acquisition for RT locks * @lock: The underlying RT mutex + * @wake_q: The wake_q to wake tasks after we release the wait_lock */ -static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock) +static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, + struct wake_q_head *wake_q) + __releases(&lock->wait_lock) __acquires(&lock->wait_lock) { struct rt_mutex_waiter waiter; struct task_struct *owner; @@ -1821,7 +1838,7 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock) trace_contention_begin(lock, LCB_F_RT); - task_blocks_on_rt_mutex(lock, &waiter, current, NULL, RT_MUTEX_MIN_CHAINWALK); + task_blocks_on_rt_mutex(lock, &waiter, current, NULL, RT_MUTEX_MIN_CHAINWALK, wake_q); for (;;) { /* Try to 
acquire the lock again */ @@ -1832,7 +1849,11 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock) owner = rt_mutex_owner(lock); else owner = NULL; + preempt_disable(); raw_spin_unlock_irq(&lock->wait_lock); + wake_up_q(wake_q); + wake_q_init(wake_q); + preempt_enable(); if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) schedule_rtlock(); @@ -1857,10 +1878,14 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock) static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock) { unsigned long flags; + DEFINE_WAKE_Q(wake_q); raw_spin_lock_irqsave(&lock->wait_lock, flags); - rtlock_slowlock_locked(lock); + rtlock_slowlock_locked(lock, &wake_q); + preempt_disable(); raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + wake_up_q(&wake_q); + preempt_enable(); } #endif /* RT_MUTEX_BUILD_SPINLOCKS */ diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index a6974d044593..33ea31d6a7b3 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -175,10 +175,10 @@ bool __sched __rt_mutex_futex_unlock(struct rt_mutex_base *lock, } /* - * We've already deboosted, mark_wakeup_next_waiter() will - * retain preempt_disabled when we drop the wait_lock, to - * avoid inversion prior to the wakeup. preempt_disable() - * therein pairs with rt_mutex_postunlock(). + * mark_wakeup_next_waiter() deboosts and retains preemption + * disabled when dropping the wait_lock, to avoid inversion prior + * to the wakeup. preempt_disable() therein pairs with the + * preempt_enable() in rt_mutex_postunlock(). */ mark_wakeup_next_waiter(wqh, lock); @@ -275,6 +275,7 @@ void __sched rt_mutex_proxy_unlock(struct rt_mutex_base *lock) * @lock: the rt_mutex to take * @waiter: the pre-initialized rt_mutex_waiter * @task: the task to prepare + * @wake_q: the wake_q to wake tasks after we release the wait_lock * * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock * detection. 
It does not wait, see rt_mutex_wait_proxy_lock() for that. @@ -291,7 +292,8 @@ void __sched rt_mutex_proxy_unlock(struct rt_mutex_base *lock) */ int __sched __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter, - struct task_struct *task) + struct task_struct *task, + struct wake_q_head *wake_q) { int ret; @@ -302,7 +304,7 @@ int __sched __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, /* We enforce deadlock detection for futexes */ ret = task_blocks_on_rt_mutex(lock, waiter, task, NULL, - RT_MUTEX_FULL_CHAINWALK); + RT_MUTEX_FULL_CHAINWALK, wake_q); if (ret && !rt_mutex_owner(lock)) { /* @@ -341,12 +343,16 @@ int __sched rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, struct task_struct *task) { int ret; + DEFINE_WAKE_Q(wake_q); raw_spin_lock_irq(&lock->wait_lock); - ret = __rt_mutex_start_proxy_lock(lock, waiter, task); + ret = __rt_mutex_start_proxy_lock(lock, waiter, task, &wake_q); if (unlikely(ret)) remove_waiter(lock, waiter); + preempt_disable(); raw_spin_unlock_irq(&lock->wait_lock); + wake_up_q(&wake_q); + preempt_enable(); return ret; } diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 1162e07cdaea..c38a2d2d4a7e 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -83,7 +83,8 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex_base *lock, extern void rt_mutex_proxy_unlock(struct rt_mutex_base *lock); extern int __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter, - struct task_struct *task); + struct task_struct *task, + struct wake_q_head *); extern int rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter, struct task_struct *task); diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c index 34a59569db6b..9f4322c07486 100644 --- a/kernel/locking/rwbase_rt.c +++ b/kernel/locking/rwbase_rt.c @@ -69,6 +69,7 @@ static int __sched __rwbase_read_lock(struct 
rwbase_rt *rwb, unsigned int state) { struct rt_mutex_base *rtm = &rwb->rtmutex; + DEFINE_WAKE_Q(wake_q); int ret; rwbase_pre_schedule(); @@ -110,7 +111,7 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb, * For rwlocks this returns 0 unconditionally, so the below * !ret conditionals are optimized out. */ - ret = rwbase_rtmutex_slowlock_locked(rtm, state); + ret = rwbase_rtmutex_slowlock_locked(rtm, state, &wake_q); /* * On success the rtmutex is held, so there can't be a writer @@ -121,7 +122,12 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb, */ if (!ret) atomic_inc(&rwb->readers); + + preempt_disable(); raw_spin_unlock_irq(&rtm->wait_lock); + wake_up_q(&wake_q); + preempt_enable(); + if (!ret) rwbase_rtmutex_unlock(rtm); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 2bbb6eca5144..2ddb827e3bea 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1413,8 +1413,8 @@ static inline void __downgrade_write(struct rw_semaphore *sem) #define rwbase_rtmutex_lock_state(rtm, state) \ __rt_mutex_lock(rtm, state) -#define rwbase_rtmutex_slowlock_locked(rtm, state) \ - __rt_mutex_slowlock_locked(rtm, NULL, state) +#define rwbase_rtmutex_slowlock_locked(rtm, state, wq) \ + __rt_mutex_slowlock_locked(rtm, NULL, state, wq) #define rwbase_rtmutex_unlock(rtm) \ __rt_mutex_unlock(rtm) diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 438c6086d540..7685defd7c52 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -65,7 +65,7 @@ EXPORT_PER_CPU_SYMBOL(__mmiowb_state); * towards that other CPU that it should break the lock ASAP. 
*/ #define BUILD_LOCK_OPS(op, locktype) \ -void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ +static void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ { \ for (;;) { \ preempt_disable(); \ @@ -77,7 +77,7 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ } \ } \ \ -unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ +static unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ { \ unsigned long flags; \ \ @@ -95,12 +95,12 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ return flags; \ } \ \ -void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \ +static void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \ { \ _raw_##op##_lock_irqsave(lock); \ } \ \ -void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ +static void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ { \ unsigned long flags; \ \ diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c index 38e292454fcc..db1e11b45de6 100644 --- a/kernel/locking/spinlock_rt.c +++ b/kernel/locking/spinlock_rt.c @@ -51,7 +51,7 @@ static __always_inline void __rt_spin_lock(spinlock_t *lock) migrate_disable(); } -void __sched rt_spin_lock(spinlock_t *lock) +void __sched rt_spin_lock(spinlock_t *lock) __acquires(RCU) { spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); __rt_spin_lock(lock); @@ -75,7 +75,7 @@ void __sched rt_spin_lock_nest_lock(spinlock_t *lock, EXPORT_SYMBOL(rt_spin_lock_nest_lock); #endif -void __sched rt_spin_unlock(spinlock_t *lock) +void __sched rt_spin_unlock(spinlock_t *lock) __releases(RCU) { spin_release(&lock->dep_map, _RET_IP_); migrate_enable(); @@ -162,9 +162,10 @@ rwbase_rtmutex_lock_state(struct rt_mutex_base *rtm, unsigned int state) } static __always_inline int -rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state) +rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state, + struct wake_q_head *wake_q) { - 
rtlock_slowlock_locked(rtm); + rtlock_slowlock_locked(rtm, wake_q); return 0; } @@ -225,7 +226,7 @@ int __sched rt_write_trylock(rwlock_t *rwlock) } EXPORT_SYMBOL(rt_write_trylock); -void __sched rt_read_lock(rwlock_t *rwlock) +void __sched rt_read_lock(rwlock_t *rwlock) __acquires(RCU) { rtlock_might_resched(); rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_); @@ -235,7 +236,7 @@ void __sched rt_read_lock(rwlock_t *rwlock) } EXPORT_SYMBOL(rt_read_lock); -void __sched rt_write_lock(rwlock_t *rwlock) +void __sched rt_write_lock(rwlock_t *rwlock) __acquires(RCU) { rtlock_might_resched(); rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); @@ -246,7 +247,7 @@ void __sched rt_write_lock(rwlock_t *rwlock) EXPORT_SYMBOL(rt_write_lock); #ifdef CONFIG_DEBUG_LOCK_ALLOC -void __sched rt_write_lock_nested(rwlock_t *rwlock, int subclass) +void __sched rt_write_lock_nested(rwlock_t *rwlock, int subclass) __acquires(RCU) { rtlock_might_resched(); rwlock_acquire(&rwlock->dep_map, subclass, 0, _RET_IP_); @@ -257,7 +258,7 @@ void __sched rt_write_lock_nested(rwlock_t *rwlock, int subclass) EXPORT_SYMBOL(rt_write_lock_nested); #endif -void __sched rt_read_unlock(rwlock_t *rwlock) +void __sched rt_read_unlock(rwlock_t *rwlock) __releases(RCU) { rwlock_release(&rwlock->dep_map, _RET_IP_); migrate_enable(); @@ -266,7 +267,7 @@ void __sched rt_read_unlock(rwlock_t *rwlock) } EXPORT_SYMBOL(rt_read_unlock); -void __sched rt_write_unlock(rwlock_t *rwlock) +void __sched rt_write_unlock(rwlock_t *rwlock) __releases(RCU) { rwlock_release(&rwlock->dep_map, _RET_IP_); rcu_read_unlock(); diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 10a5736a21c2..5d58b2c0ef98 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -62,7 +62,8 @@ static int __test_mutex(unsigned int flags) int ret; ww_mutex_init(&mtx.mutex, &ww_class); - ww_acquire_init(&ctx, &ww_class); + if (flags & TEST_MTX_CTX) + ww_acquire_init(&ctx, &ww_class); 
INIT_WORK_ONSTACK(&mtx.work, test_mutex_work); init_completion(&mtx.ready); @@ -90,7 +91,8 @@ static int __test_mutex(unsigned int flags) ret = wait_for_completion_timeout(&mtx.done, TIMEOUT); } ww_mutex_unlock(&mtx.mutex); - ww_acquire_fini(&ctx); + if (flags & TEST_MTX_CTX) + ww_acquire_fini(&ctx); if (ret) { pr_err("%s(flags=%x): mutual exclusion failure\n", @@ -679,7 +681,7 @@ static int __init test_ww_mutex_init(void) if (ret) return ret; - ret = stress(2047, hweight32(STRESS_ALL)*ncpus, STRESS_ALL); + ret = stress(2046, hweight32(STRESS_ALL)*ncpus, STRESS_ALL); if (ret) return ret; diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 76d204b7d29c..37f025a096c9 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -70,14 +70,14 @@ __ww_mutex_has_waiters(struct mutex *lock) return atomic_long_read(&lock->owner) & MUTEX_FLAG_WAITERS; } -static inline void lock_wait_lock(struct mutex *lock) +static inline void lock_wait_lock(struct mutex *lock, unsigned long *flags) { - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, *flags); } -static inline void unlock_wait_lock(struct mutex *lock) +static inline void unlock_wait_lock(struct mutex *lock, unsigned long *flags) { - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, *flags); } static inline void lockdep_assert_wait_lock_held(struct mutex *lock) @@ -144,14 +144,14 @@ __ww_mutex_has_waiters(struct rt_mutex *lock) return rt_mutex_has_waiters(&lock->rtmutex); } -static inline void lock_wait_lock(struct rt_mutex *lock) +static inline void lock_wait_lock(struct rt_mutex *lock, unsigned long *flags) { - raw_spin_lock(&lock->rtmutex.wait_lock); + raw_spin_lock_irqsave(&lock->rtmutex.wait_lock, *flags); } -static inline void unlock_wait_lock(struct rt_mutex *lock) +static inline void unlock_wait_lock(struct rt_mutex *lock, unsigned long *flags) { - raw_spin_unlock(&lock->rtmutex.wait_lock); + 
raw_spin_unlock_irqrestore(&lock->rtmutex.wait_lock, *flags); } static inline void lockdep_assert_wait_lock_held(struct rt_mutex *lock) @@ -275,7 +275,7 @@ __ww_ctx_less(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) */ static bool __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter, - struct ww_acquire_ctx *ww_ctx) + struct ww_acquire_ctx *ww_ctx, struct wake_q_head *wake_q) { if (!ww_ctx->is_wait_die) return false; @@ -284,7 +284,7 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter, #ifndef WW_RT debug_mutex_wake_waiter(lock, waiter); #endif - wake_up_process(waiter->task); + wake_q_add(wake_q, waiter->task); } return true; @@ -299,7 +299,8 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter, */ static bool __ww_mutex_wound(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx, - struct ww_acquire_ctx *hold_ctx) + struct ww_acquire_ctx *hold_ctx, + struct wake_q_head *wake_q) { struct task_struct *owner = __ww_mutex_owner(lock); @@ -331,7 +332,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock, * wakeup pending to re-read the wounded state. */ if (owner != current) - wake_up_process(owner); + wake_q_add(wake_q, owner); return true; } @@ -352,7 +353,8 @@ static bool __ww_mutex_wound(struct MUTEX *lock, * The current task must not be on the wait list. 
*/ static void -__ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx) +__ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx, + struct wake_q_head *wake_q) { struct MUTEX_WAITER *cur; @@ -364,8 +366,8 @@ __ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx) if (!cur->ww_ctx) continue; - if (__ww_mutex_die(lock, cur, ww_ctx) || - __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx)) + if (__ww_mutex_die(lock, cur, ww_ctx, wake_q) || + __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx, wake_q)) break; } } @@ -377,6 +379,9 @@ __ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx) static __always_inline void ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { + DEFINE_WAKE_Q(wake_q); + unsigned long flags; + ww_mutex_lock_acquired(lock, ctx); /* @@ -404,9 +409,12 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) * Uh oh, we raced in fastpath, check if any of the waiters need to * die or wound us. */ - lock_wait_lock(&lock->base); - __ww_mutex_check_waiters(&lock->base, ctx); - unlock_wait_lock(&lock->base); + lock_wait_lock(&lock->base, &flags); + __ww_mutex_check_waiters(&lock->base, ctx, &wake_q); + preempt_disable(); + unlock_wait_lock(&lock->base, &flags); + wake_up_q(&wake_q); + preempt_enable(); } static __always_inline int @@ -488,7 +496,8 @@ __ww_mutex_check_kill(struct MUTEX *lock, struct MUTEX_WAITER *waiter, static inline int __ww_mutex_add_waiter(struct MUTEX_WAITER *waiter, struct MUTEX *lock, - struct ww_acquire_ctx *ww_ctx) + struct ww_acquire_ctx *ww_ctx, + struct wake_q_head *wake_q) { struct MUTEX_WAITER *cur, *pos = NULL; bool is_wait_die; @@ -532,7 +541,7 @@ __ww_mutex_add_waiter(struct MUTEX_WAITER *waiter, pos = cur; /* Wait-Die: ensure younger waiters die. 
*/ - __ww_mutex_die(lock, cur, ww_ctx); + __ww_mutex_die(lock, cur, ww_ctx, wake_q); } __ww_waiter_add(lock, waiter, pos); @@ -550,7 +559,7 @@ __ww_mutex_add_waiter(struct MUTEX_WAITER *waiter, * such that either we or the fastpath will wound @ww->ctx. */ smp_mb(); - __ww_mutex_wound(lock, ww_ctx, ww->ctx); + __ww_mutex_wound(lock, ww_ctx, ww->ctx, wake_q); } return 0; diff --git a/kernel/module/dups.c b/kernel/module/dups.c index 9a92f2f8c9d3..bd2149fbe117 100644 --- a/kernel/module/dups.c +++ b/kernel/module/dups.c @@ -18,7 +18,6 @@ #include <linux/completion.h> #include <linux/cred.h> #include <linux/file.h> -#include <linux/fdtable.h> #include <linux/workqueue.h> #include <linux/security.h> #include <linux/mount.h> diff --git a/kernel/module/kmod.c b/kernel/module/kmod.c index 0800d9891692..25f253812512 100644 --- a/kernel/module/kmod.c +++ b/kernel/module/kmod.c @@ -15,7 +15,6 @@ #include <linux/completion.h> #include <linux/cred.h> #include <linux/file.h> -#include <linux/fdtable.h> #include <linux/workqueue.h> #include <linux/security.h> #include <linux/mount.h> diff --git a/kernel/module/main.c b/kernel/module/main.c index 49b9bca9de12..4490924fe24e 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -3202,7 +3202,7 @@ static int idempotent_init_module(struct file *f, const char __user * uargs, int { struct idempotent idem; - if (!f || !(f->f_mode & FMODE_READ)) + if (!(f->f_mode & FMODE_READ)) return -EBADF; /* Are we the winners of the race and get to do this? 
*/ @@ -3219,10 +3219,7 @@ static int idempotent_init_module(struct file *f, const char __user * uargs, int SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) { - int err; - struct fd f; - - err = may_init_module(); + int err = may_init_module(); if (err) return err; @@ -3233,10 +3230,10 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) |MODULE_INIT_COMPRESSED_FILE)) return -EINVAL; - f = fdget(fd); - err = idempotent_init_module(fd_file(f), uargs, flags); - fdput(f); - return err; + CLASS(fd, f)(fd); + if (fd_empty(f)) + return -EBADF; + return idempotent_init_module(fd_file(f), uargs, flags); } /* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */ diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index dc952c3b05af..c9d97ed20122 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -545,12 +545,12 @@ static void commit_nsset(struct nsset *nsset) SYSCALL_DEFINE2(setns, int, fd, int, flags) { - struct fd f = fdget(fd); + CLASS(fd, f)(fd); struct ns_common *ns = NULL; struct nsset nsset = {}; int err = 0; - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; if (proc_ns_file(fd_file(f))) { @@ -580,7 +580,6 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags) } put_nsset(&nsset); out: - fdput(f); return err; } diff --git a/kernel/padata.c b/kernel/padata.c index d899f34558af..d51bbc76b227 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -521,13 +521,6 @@ void __init padata_do_multithreaded(struct padata_mt_job *job) ps.chunk_size = max(ps.chunk_size, 1ul); ps.chunk_size = roundup(ps.chunk_size, job->align); - /* - * chunk_size can be 0 if the caller sets min_chunk to 0. 
So force it - * to at least 1 to prevent divide-by-0 panic in padata_mt_helper().` - */ - if (!ps.chunk_size) - ps.chunk_size = 1U; - list_for_each_entry(pw, &works, pw_list) if (job->numa_aware) { int old_node = atomic_read(&last_used_nid); diff --git a/kernel/pid.c b/kernel/pid.c index 2715afb77eab..115448e89c3e 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -536,11 +536,10 @@ EXPORT_SYMBOL_GPL(find_ge_pid); struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) { - struct fd f; + CLASS(fd, f)(fd); struct pid *pid; - f = fdget(fd); - if (!fd_file(f)) + if (fd_empty(f)) return ERR_PTR(-EBADF); pid = pidfd_pid(fd_file(f)); @@ -548,8 +547,6 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) get_pid(pid); *flags = fd_file(f)->f_flags; } - - fdput(f); return pid; } @@ -747,23 +744,18 @@ SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd, unsigned int, flags) { struct pid *pid; - struct fd f; - int ret; /* flags is currently unused - make sure it's unset */ if (flags) return -EINVAL; - f = fdget(pidfd); - if (!fd_file(f)) + CLASS(fd, f)(pidfd); + if (fd_empty(f)) return -EBADF; pid = pidfd_pid(fd_file(f)); if (IS_ERR(pid)) - ret = PTR_ERR(pid); - else - ret = pidfd_getfd(pid, fd); + return PTR_ERR(pid); - fdput(f); - return ret; + return pidfd_getfd(pid, fd); } diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 927cc55ba0b3..d07faf42eace 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -628,6 +628,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, goto unlock; dev->em_pd->flags |= flags; + dev->em_pd->min_perf_state = 0; + dev->em_pd->max_perf_state = nr_states - 1; em_cpufreq_update_efficiencies(dev, dev->em_pd->em_table->state); @@ -856,3 +858,53 @@ int em_dev_update_chip_binning(struct device *dev) return em_recalc_and_update(dev, pd, em_table); } EXPORT_SYMBOL_GPL(em_dev_update_chip_binning); + + +/** + * em_update_performance_limits() - Update Energy Model 
with performance + * limits information. + * @pd : Performance Domain with EM that has to be updated. + * @freq_min_khz : New minimum allowed frequency for this device. + * @freq_max_khz : New maximum allowed frequency for this device. + * + * This function allows to update the EM with information about available + * performance levels. It takes the minimum and maximum frequency in kHz + * and does internal translation to performance levels. + * Returns 0 on success or -EINVAL when failed. + */ +int em_update_performance_limits(struct em_perf_domain *pd, + unsigned long freq_min_khz, unsigned long freq_max_khz) +{ + struct em_perf_state *table; + int min_ps = -1; + int max_ps = -1; + int i; + + if (!pd) + return -EINVAL; + + rcu_read_lock(); + table = em_perf_state_from_pd(pd); + + for (i = 0; i < pd->nr_perf_states; i++) { + if (freq_min_khz == table[i].frequency) + min_ps = i; + if (freq_max_khz == table[i].frequency) + max_ps = i; + } + rcu_read_unlock(); + + /* Only update when both are found and sane */ + if (min_ps < 0 || max_ps < 0 || max_ps < min_ps) + return -EINVAL; + + + /* Guard simultaneous updates and make them atomic */ + mutex_lock(&em_pd_mutex); + pd->min_perf_state = min_ps; + pd->max_perf_state = max_ps; + mutex_unlock(&em_pd_mutex); + + return 0; +} +EXPORT_SYMBOL_GPL(em_update_performance_limits); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index beb808f4c367..19911c8fa7b6 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -560,10 +560,11 @@ bool printk_percpu_data_ready(void) /* Must be called under syslog_lock. */ static void latched_seq_write(struct latched_seq *ls, u64 val) { - raw_write_seqcount_latch(&ls->latch); + write_seqcount_latch_begin(&ls->latch); ls->val[0] = val; - raw_write_seqcount_latch(&ls->latch); + write_seqcount_latch(&ls->latch); ls->val[1] = val; + write_seqcount_latch_end(&ls->latch); } /* Can be called from any context. 
*/ @@ -574,10 +575,10 @@ static u64 latched_seq_read_nolock(struct latched_seq *ls) u64 val; do { - seq = raw_read_seqcount_latch(&ls->latch); + seq = read_seqcount_latch(&ls->latch); idx = seq & 0x1; val = ls->val[idx]; - } while (raw_read_seqcount_latch_retry(&ls->latch, seq)); + } while (read_seqcount_latch_retry(&ls->latch, seq)); return val; } diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 3e079de0f5b4..b9b6bc55185d 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -249,16 +249,24 @@ config RCU_NOCB_CPU workloads will incur significant increases in context-switch rates. - This option offloads callback invocation from the set of CPUs - specified at boot time by the rcu_nocbs parameter. For each - such CPU, a kthread ("rcuox/N") will be created to invoke - callbacks, where the "N" is the CPU being offloaded, and where - the "x" is "p" for RCU-preempt (PREEMPTION kernels) and "s" for - RCU-sched (!PREEMPTION kernels). Nothing prevents this kthread - from running on the specified CPUs, but (1) the kthreads may be - preempted between each callback, and (2) affinity or cgroups can - be used to force the kthreads to run on whatever set of CPUs is - desired. + This option offloads callback invocation from the set of + CPUs specified at boot time by the rcu_nocbs parameter. + For each such CPU, a kthread ("rcuox/N") will be created to + invoke callbacks, where the "N" is the CPU being offloaded, + and where the "x" is "p" for RCU-preempt (PREEMPTION kernels) + and "s" for RCU-sched (!PREEMPTION kernels). This option + also creates another kthread for each sqrt(nr_cpu_ids) CPUs + ("rcuog/N", where N is the first CPU in that group to come + online), which handles grace periods for its group. Nothing + prevents these kthreads from running on the specified CPUs, + but (1) the kthreads may be preempted between each callback, + and (2) affinity or cgroups can be used to force the kthreads + to run on whatever set of CPUs is desired. 
+ + The sqrt(nr_cpu_ids) grouping may be overridden using the + rcutree.rcu_nocb_gp_stride kernel boot parameter. This can + be especially helpful for smaller numbers of CPUs, where + sqrt(nr_cpu_ids) can be a bit of a blunt instrument. Say Y here if you need reduced OS jitter, despite added overhead. Say N here if you are unsure. diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 259904075636..fadc08ad4b7b 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -120,7 +120,6 @@ void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp); void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v); void rcu_segcblist_init(struct rcu_segcblist *rsclp); void rcu_segcblist_disable(struct rcu_segcblist *rsclp); -void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload); bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 6d37596deb1f..0f3059b1b80d 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -889,14 +889,14 @@ kfree_scale_init(void) if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start < 2 * HZ)) { pr_alert("ERROR: call_rcu() CBs are not being lazy as expected!\n"); - WARN_ON_ONCE(1); - return -1; + firsterr = -1; + goto unwind; } if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start > 3 * HZ)) { pr_alert("ERROR: call_rcu() CBs are being too lazy!\n"); - WARN_ON_ONCE(1); - return -1; + firsterr = -1; + goto unwind; } } diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index bb75dbf5c800..612d27690335 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -57,9 +57,9 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@ /* Bits for ->extendables field, extendables param, and related definitions. 
*/ #define RCUTORTURE_RDR_SHIFT_1 8 /* Put SRCU index in upper bits. */ -#define RCUTORTURE_RDR_MASK_1 (1 << RCUTORTURE_RDR_SHIFT_1) -#define RCUTORTURE_RDR_SHIFT_2 9 /* Put SRCU index in upper bits. */ -#define RCUTORTURE_RDR_MASK_2 (1 << RCUTORTURE_RDR_SHIFT_2) +#define RCUTORTURE_RDR_MASK_1 (0xff << RCUTORTURE_RDR_SHIFT_1) +#define RCUTORTURE_RDR_SHIFT_2 16 /* Put SRCU index in upper bits. */ +#define RCUTORTURE_RDR_MASK_2 (0xff << RCUTORTURE_RDR_SHIFT_2) #define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */ #define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */ #define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */ @@ -71,6 +71,9 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@ #define RCUTORTURE_MAX_EXTEND \ (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \ RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED) +#define RCUTORTURE_RDR_ALLBITS \ + (RCUTORTURE_MAX_EXTEND | RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2 | \ + RCUTORTURE_RDR_MASK_1 | RCUTORTURE_RDR_MASK_2) #define RCUTORTURE_RDR_MAX_LOOPS 0x7 /* Maximum reader extensions. */ /* Must be power of two minus one. 
*/ #define RCUTORTURE_RDR_MAX_SEGS (RCUTORTURE_RDR_MAX_LOOPS + 3) @@ -108,6 +111,7 @@ torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disab torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)"); torture_param(int, read_exit_delay, 13, "Delay between read-then-exit episodes (s)"); torture_param(int, read_exit_burst, 16, "# of read-then-exit bursts per episode, zero to disable"); +torture_param(int, reader_flavor, 0x1, "Reader flavors to use, one per bit."); torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles"); torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); @@ -393,6 +397,7 @@ struct rcu_torture_ops { int slow_gps; int no_pi_lock; int debug_objects; + int start_poll_irqsoff; const char *name; }; @@ -581,6 +586,7 @@ static struct rcu_torture_ops rcu_ops = { .can_boost = IS_ENABLED(CONFIG_RCU_BOOST), .extendables = RCUTORTURE_MAX_EXTEND, .debug_objects = 1, + .start_poll_irqsoff = 1, .name = "rcu" }; @@ -641,10 +647,25 @@ static void srcu_get_gp_data(int *flags, unsigned long *gp_seq) static int srcu_torture_read_lock(void) { - if (cur_ops == &srcud_ops) - return srcu_read_lock_nmisafe(srcu_ctlp); - else - return srcu_read_lock(srcu_ctlp); + int idx; + int ret = 0; + + if ((reader_flavor & 0x1) || !(reader_flavor & 0x7)) { + idx = srcu_read_lock(srcu_ctlp); + WARN_ON_ONCE(idx & ~0x1); + ret += idx; + } + if (reader_flavor & 0x2) { + idx = srcu_read_lock_nmisafe(srcu_ctlp); + WARN_ON_ONCE(idx & ~0x1); + ret += idx << 1; + } + if (reader_flavor & 0x4) { + idx = srcu_read_lock_lite(srcu_ctlp); + WARN_ON_ONCE(idx & ~0x1); + ret += idx << 2; + } + return ret; } static void @@ -668,10 +689,13 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) static void srcu_torture_read_unlock(int idx) { - if (cur_ops == &srcud_ops) - srcu_read_unlock_nmisafe(srcu_ctlp, idx); - 
else - srcu_read_unlock(srcu_ctlp, idx); + WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); + if (reader_flavor & 0x4) + srcu_read_unlock_lite(srcu_ctlp, (idx & 0x4) >> 2); + if (reader_flavor & 0x2) + srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1); + if ((reader_flavor & 0x1) || !(reader_flavor & 0x7)) + srcu_read_unlock(srcu_ctlp, idx & 0x1); } static int torture_srcu_read_lock_held(void) @@ -1059,8 +1083,13 @@ static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long *star // At most one persisted message per boost test. j = jiffies; lp = READ_ONCE(last_persist); - if (time_after(j, lp + mininterval) && cmpxchg(&last_persist, lp, j) == lp) - pr_info("Boost inversion persisted: No QS from CPU %d\n", cpu); + if (time_after(j, lp + mininterval) && + cmpxchg(&last_persist, lp, j) == lp) { + if (cpu < 0) + pr_info("Boost inversion persisted: QS from all CPUs\n"); + else + pr_info("Boost inversion persisted: No QS from CPU %d\n", cpu); + } return false; // passed on a technicality } VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed"); @@ -1695,14 +1724,22 @@ rcu_torture_fakewriter(void *arg) cur_ops->cond_sync_exp_full(&gp_snap_full); break; case RTWS_POLL_GET: + if (cur_ops->start_poll_irqsoff) + local_irq_disable(); gp_snap = cur_ops->start_gp_poll(); + if (cur_ops->start_poll_irqsoff) + local_irq_enable(); while (!cur_ops->poll_gp_state(gp_snap)) { torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); } break; case RTWS_POLL_GET_FULL: + if (cur_ops->start_poll_irqsoff) + local_irq_disable(); cur_ops->start_gp_poll_full(&gp_snap_full); + if (cur_ops->start_poll_irqsoff) + local_irq_enable(); while (!cur_ops->poll_gp_state_full(&gp_snap_full)) { torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); @@ -1820,7 +1857,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, int statesold = *readstate & ~newstate; WARN_ON_ONCE(idxold2 < 0); - WARN_ON_ONCE((idxold2 >> 
RCUTORTURE_RDR_SHIFT_2) > 1); + WARN_ON_ONCE(idxold2 & ~RCUTORTURE_RDR_ALLBITS); rtrsp->rt_readstate = newstate; /* First, put new protection in place to avoid critical-section gap. */ @@ -1835,9 +1872,9 @@ static void rcutorture_one_extend(int *readstate, int newstate, if (statesnew & RCUTORTURE_RDR_SCHED) rcu_read_lock_sched(); if (statesnew & RCUTORTURE_RDR_RCU_1) - idxnew1 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_1; + idxnew1 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_1) & RCUTORTURE_RDR_MASK_1; if (statesnew & RCUTORTURE_RDR_RCU_2) - idxnew2 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_2; + idxnew2 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_2) & RCUTORTURE_RDR_MASK_2; /* * Next, remove old protection, in decreasing order of strength @@ -1857,7 +1894,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, if (statesold & RCUTORTURE_RDR_RBH) rcu_read_unlock_bh(); if (statesold & RCUTORTURE_RDR_RCU_2) { - cur_ops->readunlock((idxold2 >> RCUTORTURE_RDR_SHIFT_2) & 0x1); + cur_ops->readunlock((idxold2 & RCUTORTURE_RDR_MASK_2) >> RCUTORTURE_RDR_SHIFT_2); WARN_ON_ONCE(idxnew2 != -1); idxold2 = 0; } @@ -1867,7 +1904,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, lockit = !cur_ops->no_pi_lock && !statesnew && !(torture_random(trsp) & 0xffff); if (lockit) raw_spin_lock_irqsave(&current->pi_lock, flags); - cur_ops->readunlock((idxold1 >> RCUTORTURE_RDR_SHIFT_1) & 0x1); + cur_ops->readunlock((idxold1 & RCUTORTURE_RDR_MASK_1) >> RCUTORTURE_RDR_SHIFT_1); WARN_ON_ONCE(idxnew1 != -1); idxold1 = 0; if (lockit) @@ -1882,16 +1919,13 @@ static void rcutorture_one_extend(int *readstate, int newstate, if (idxnew1 == -1) idxnew1 = idxold1 & RCUTORTURE_RDR_MASK_1; WARN_ON_ONCE(idxnew1 < 0); - if (WARN_ON_ONCE((idxnew1 >> RCUTORTURE_RDR_SHIFT_1) > 1)) - pr_info("Unexpected idxnew1 value of %#x\n", idxnew1); if (idxnew2 == -1) idxnew2 = idxold2 & RCUTORTURE_RDR_MASK_2; WARN_ON_ONCE(idxnew2 < 0); - WARN_ON_ONCE((idxnew2 >> 
RCUTORTURE_RDR_SHIFT_2) > 1); *readstate = idxnew1 | idxnew2 | newstate; WARN_ON_ONCE(*readstate < 0); - if (WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT_2) > 1)) - pr_info("Unexpected idxnew2 value of %#x\n", idxnew2); + if (WARN_ON_ONCE(*readstate & ~RCUTORTURE_RDR_ALLBITS)) + pr_info("Unexpected readstate value of %#x\n", *readstate); } /* Return the biggest extendables mask given current RCU and boot parameters. */ @@ -1916,7 +1950,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ; unsigned long bhs = RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; - WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT_1); + WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT_1); // Can't have reader idx bits. /* Mostly only one bit (need preemption!), sometimes lots of bits. */ if (!(randmask1 & 0x7)) mask = mask & randmask2; @@ -2389,6 +2423,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "n_barrier_cbs=%d " "onoff_interval=%d onoff_holdoff=%d " "read_exit_delay=%d read_exit_burst=%d " + "reader_flavor=%x " "nocbs_nthreads=%d nocbs_toggle=%d " "test_nmis=%d\n", torture_type, tag, nrealreaders, nfakewriters, @@ -2401,6 +2436,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) n_barrier_cbs, onoff_interval, onoff_holdoff, read_exit_delay, read_exit_burst, + reader_flavor, nocbs_nthreads, nocbs_toggle, test_nmis); } @@ -2440,6 +2476,14 @@ static int rcutorture_booster_init(unsigned int cpu) WARN_ON_ONCE(!t); sp.sched_priority = 2; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); +#ifdef CONFIG_IRQ_FORCED_THREADING + if (force_irqthreads()) { + t = per_cpu(ktimerd, cpu); + WARN_ON_ONCE(!t); + sp.sched_priority = 2; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + } +#endif } /* Don't allow time recalculation while creating a new task. 
*/ diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 0db9db73f57f..aacfcc9838b3 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -75,6 +75,9 @@ MODULE_PARM_DESC(scale_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock."); torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); torture_param(int, verbose_batched, 0, "Batch verbose debugging printk()s"); +// Number of seconds to extend warm-up and cool-down for multiple guest OSes +torture_param(long, guest_os_delay, 0, + "Number of seconds to extend warm-up/cool-down for multiple guest OSes."); // Wait until there are multiple CPUs before starting test. torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0, "Holdoff time before test start (s)"); @@ -212,6 +215,36 @@ static const struct ref_scale_ops srcu_ops = { .name = "srcu" }; +static void srcu_lite_ref_scale_read_section(const int nloops) +{ + int i; + int idx; + + for (i = nloops; i >= 0; i--) { + idx = srcu_read_lock_lite(srcu_ctlp); + srcu_read_unlock_lite(srcu_ctlp, idx); + } +} + +static void srcu_lite_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + int idx; + + for (i = nloops; i >= 0; i--) { + idx = srcu_read_lock_lite(srcu_ctlp); + un_delay(udl, ndl); + srcu_read_unlock_lite(srcu_ctlp, idx); + } +} + +static const struct ref_scale_ops srcu_lite_ops = { + .init = rcu_sync_scale_init, + .readsection = srcu_lite_ref_scale_read_section, + .delaysection = srcu_lite_ref_scale_delay_section, + .name = "srcu-lite" +}; + #ifdef CONFIG_TASKS_RCU // Definitions for RCU Tasks ref scale testing: Empty read markers. @@ -801,6 +834,18 @@ static void rcu_scale_one_reader(void) cur_ops->delaysection(loops, readdelay / 1000, readdelay % 1000); } +// Warm up cache, or, if needed run a series of rcu_scale_one_reader() +// to allow multiple rcuscale guest OSes to collect mutually valid data. 
+static void rcu_scale_warm_cool(void) +{ + unsigned long jdone = jiffies + (guest_os_delay > 0 ? guest_os_delay * HZ : -1); + + do { + rcu_scale_one_reader(); + cond_resched(); + } while (time_before(jiffies, jdone)); +} + // Reader kthread. Repeatedly does empty RCU read-side // critical section, minimizing update-side interference. static int @@ -829,7 +874,7 @@ repeat: goto end; // Make sure that the CPU is affinitized appropriately during testing. - WARN_ON_ONCE(raw_smp_processor_id() != me); + WARN_ON_ONCE(raw_smp_processor_id() != me % nr_cpu_ids); WRITE_ONCE(rt->start_reader, 0); if (!atomic_dec_return(&n_started)) @@ -957,6 +1002,7 @@ static int main_func(void *arg) schedule_timeout_uninterruptible(1); // Start exp readers up per experiment + rcu_scale_warm_cool(); for (exp = 0; exp < nruns && !torture_must_stop(); exp++) { if (torture_must_stop()) goto end; @@ -987,6 +1033,7 @@ static int main_func(void *arg) result_avg[exp] = div_u64(1000 * process_durations(nreaders), nreaders * loops); } + rcu_scale_warm_cool(); // Print the average of all experiments SCALEOUT("END OF TEST. 
Calculating average duration per loop (nanoseconds)...\n"); @@ -1082,9 +1129,10 @@ ref_scale_init(void) long i; int firsterr = 0; static const struct ref_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, - &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, &jiffies_ops, - &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, + &rcu_ops, &srcu_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS + &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, + &clock_ops, &jiffies_ops, &typesafe_ref_ops, &typesafe_lock_ops, + &typesafe_seqlock_ops, }; if (!torture_init_begin(scale_type, verbose)) diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 549c03336ee9..4dcbf8aa80ff 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -122,8 +122,8 @@ void srcu_drive_gp(struct work_struct *wp) ssp = container_of(wp, struct srcu_struct, srcu_work); preempt_disable(); // Needed for PREEMPT_AUTO if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) { - return; /* Already running or nothing to do. */ preempt_enable(); + return; /* Already running or nothing to do. */ } /* Remove recently arrived callbacks and wait for readers. */ diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 31706e3293bc..5e2e53464794 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -128,7 +128,7 @@ static void init_srcu_struct_data(struct srcu_struct *ssp) * Initialize the per-CPU srcu_data array, which feeds into the * leaves of the srcu_node tree. */ - WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != + BUILD_BUG_ON(ARRAY_SIZE(sdp->srcu_lock_count) != ARRAY_SIZE(sdp->srcu_unlock_count)); for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); @@ -187,7 +187,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) /* Each pass through this loop initializes one srcu_node structure. 
*/ srcu_for_each_node_breadth_first(ssp, snp) { spin_lock_init(&ACCESS_PRIVATE(snp, lock)); - WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != + BUILD_BUG_ON(ARRAY_SIZE(snp->srcu_have_cbs) != ARRAY_SIZE(snp->srcu_data_have_cbs)); for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { snp->srcu_have_cbs[i] = SRCU_SNP_INIT_SEQ; @@ -419,41 +419,60 @@ static void check_init_srcu_struct(struct srcu_struct *ssp) } /* - * Returns approximate total of the readers' ->srcu_lock_count[] values - * for the rank of per-CPU counters specified by idx. + * Is the current or any upcoming grace period to be expedited? */ -static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx) +static bool srcu_gp_is_expedited(struct srcu_struct *ssp) +{ + struct srcu_usage *sup = ssp->srcu_sup; + + return ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp)); +} + +/* + * Computes approximate total of the readers' ->srcu_lock_count[] values + * for the rank of per-CPU counters specified by idx, and returns true if + * the caller did the proper barrier (gp), and if the count of the locks + * matches that of the unlocks passed in. 
+ */ +static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, unsigned long unlocks) { int cpu; + unsigned long mask = 0; unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); + struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&cpuc->srcu_lock_count[idx]); + sum += atomic_long_read(&sdp->srcu_lock_count[idx]); + if (IS_ENABLED(CONFIG_PROVE_RCU)) + mask = mask | READ_ONCE(sdp->srcu_reader_flavor); } - return sum; + WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), + "Mixed reader flavors for srcu_struct at %ps.\n", ssp); + if (mask & SRCU_READ_FLAVOR_LITE && !gp) + return false; + return sum == unlocks; } /* * Returns approximate total of the readers' ->srcu_unlock_count[] values * for the rank of per-CPU counters specified by idx. */ -static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx) +static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, unsigned long *rdm) { int cpu; unsigned long mask = 0; unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); + struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&cpuc->srcu_unlock_count[idx]); - if (IS_ENABLED(CONFIG_PROVE_RCU)) - mask = mask | READ_ONCE(cpuc->srcu_nmi_safety); + sum += atomic_long_read(&sdp->srcu_unlock_count[idx]); + mask = mask | READ_ONCE(sdp->srcu_reader_flavor); } - WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask >> 1)), - "Mixed NMI-safe readers for srcu_struct at %ps.\n", ssp); + WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), + "Mixed reader flavors for srcu_struct at %ps.\n", ssp); + *rdm = mask; return sum; } @@ -463,22 +482,28 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx) */ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) { + bool did_gp; + unsigned long rdm; unsigned 
long unlocks; - unlocks = srcu_readers_unlock_idx(ssp, idx); + unlocks = srcu_readers_unlock_idx(ssp, idx, &rdm); + did_gp = !!(rdm & SRCU_READ_FLAVOR_LITE); /* * Make sure that a lock is always counted if the corresponding * unlock is counted. Needs to be a smp_mb() as the read side may * contain a read from a variable that is written to before the * synchronize_srcu() in the write side. In this case smp_mb()s - * A and B act like the store buffering pattern. + * A and B (or X and Y) act like the store buffering pattern. * - * This smp_mb() also pairs with smp_mb() C to prevent accesses - * after the synchronize_srcu() from being executed before the - * grace period ends. + * This smp_mb() also pairs with smp_mb() C (or, in the case of X, + * Z) to prevent accesses after the synchronize_srcu() from being + * executed before the grace period ends. */ - smp_mb(); /* A */ + if (!did_gp) + smp_mb(); /* A */ + else + synchronize_rcu(); /* X */ /* * If the locks are the same as the unlocks, then there must have @@ -536,7 +561,7 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) * which are unlikely to be configured with an address space fully * populated with memory, at least not anytime soon. 
*/ - return srcu_readers_lock_idx(ssp, idx) == unlocks; + return srcu_readers_lock_idx(ssp, idx, did_gp, unlocks); } /** @@ -554,12 +579,12 @@ static bool srcu_readers_active(struct srcu_struct *ssp) unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); + struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&cpuc->srcu_lock_count[0]); - sum += atomic_long_read(&cpuc->srcu_lock_count[1]); - sum -= atomic_long_read(&cpuc->srcu_unlock_count[0]); - sum -= atomic_long_read(&cpuc->srcu_unlock_count[1]); + sum += atomic_long_read(&sdp->srcu_lock_count[0]); + sum += atomic_long_read(&sdp->srcu_lock_count[1]); + sum -= atomic_long_read(&sdp->srcu_unlock_count[0]); + sum -= atomic_long_read(&sdp->srcu_unlock_count[1]); } return sum; } @@ -622,7 +647,7 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) unsigned long jbase = SRCU_INTERVAL; struct srcu_usage *sup = ssp->srcu_sup; - if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp))) + if (srcu_gp_is_expedited(ssp)) jbase = 0; if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) { j = jiffies - 1; @@ -687,28 +712,28 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); -#ifdef CONFIG_PROVE_RCU /* - * Check for consistent NMI safety. + * Check for consistent reader flavor. */ -void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe) +void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) { - int nmi_safe_mask = 1 << nmi_safe; - int old_nmi_safe_mask; + int old_read_flavor; struct srcu_data *sdp; - /* NMI-unsafe use in NMI is a bad sign */ - WARN_ON_ONCE(!nmi_safe && in_nmi()); + /* NMI-unsafe use in NMI is a bad sign, as is multi-bit read_flavor values. 
*/ + WARN_ON_ONCE((read_flavor != SRCU_READ_FLAVOR_NMI) && in_nmi()); + WARN_ON_ONCE(read_flavor & (read_flavor - 1)); + sdp = raw_cpu_ptr(ssp->sda); - old_nmi_safe_mask = READ_ONCE(sdp->srcu_nmi_safety); - if (!old_nmi_safe_mask) { - WRITE_ONCE(sdp->srcu_nmi_safety, nmi_safe_mask); - return; + old_read_flavor = READ_ONCE(sdp->srcu_reader_flavor); + if (!old_read_flavor) { + old_read_flavor = cmpxchg(&sdp->srcu_reader_flavor, 0, read_flavor); + if (!old_read_flavor) + return; } - WARN_ONCE(old_nmi_safe_mask != nmi_safe_mask, "CPU %d old state %d new state %d\n", sdp->cpu, old_nmi_safe_mask, nmi_safe_mask); + WARN_ONCE(old_read_flavor != read_flavor, "CPU %d old state %d new state %d\n", sdp->cpu, old_read_flavor, read_flavor); } -EXPORT_SYMBOL_GPL(srcu_check_nmi_safety); -#endif /* CONFIG_PROVE_RCU */ +EXPORT_SYMBOL_GPL(__srcu_check_read_flavor); /* * Counts the new reader in the appropriate per-CPU element of the @@ -867,7 +892,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) spin_lock_irq_rcu_node(sup); idx = rcu_seq_state(sup->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); - if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp))) + if (srcu_gp_is_expedited(ssp)) cbdelay = 0; WRITE_ONCE(sup->srcu_last_gp_end, ktime_get_mono_fast_ns()); @@ -1122,6 +1147,8 @@ static void srcu_flip(struct srcu_struct *ssp) * it stays until either (1) Compilers learn about this sort of * control dependency or (2) Some production workload running on * a production system is unduly delayed by this slowpath smp_mb(). + * Except for _lite() readers, where it is inoperative, which + * means that it is a good thing that it is redundant. */ smp_mb(); /* E */ /* Pairs with B and C. */ @@ -1139,7 +1166,9 @@ static void srcu_flip(struct srcu_struct *ssp) } /* - * If SRCU is likely idle, return true, otherwise return false. 
+ * If SRCU is likely idle, in other words, the next SRCU grace period + * should be expedited, return true, otherwise return false. Except that + * in the presence of _lite() readers, always return false. * * Note that it is OK for several current from-idle requests for a new * grace period from idle to specify expediting because they will all end @@ -1159,7 +1188,7 @@ static void srcu_flip(struct srcu_struct *ssp) * negligible when amortized over that time period, and the extra latency * of a needlessly non-expedited grace period is similarly negligible. */ -static bool srcu_might_be_idle(struct srcu_struct *ssp) +static bool srcu_should_expedite(struct srcu_struct *ssp) { unsigned long curseq; unsigned long flags; @@ -1168,6 +1197,9 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp) unsigned long tlast; check_init_srcu_struct(ssp); + /* If _lite() readers, don't do unsolicited expediting. */ + if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE) + return false; /* If the local srcu_data structure has callbacks, not idle. */ sdp = raw_cpu_ptr(ssp->sda); spin_lock_irqsave_rcu_node(sdp, flags); @@ -1469,14 +1501,15 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); * Implementation of these memory-ordering guarantees is similar to * that of synchronize_rcu(). * - * If SRCU is likely idle, expedite the first request. This semantic - * was provided by Classic SRCU, and is relied upon by its users, so TREE - * SRCU must also provide it. Note that detecting idleness is heuristic - * and subject to both false positives and negatives. + * If SRCU is likely idle as determined by srcu_should_expedite(), + * expedite the first request. This semantic was provided by Classic SRCU, + * and is relied upon by its users, so TREE SRCU must also provide it. + * Note that detecting idleness is heuristic and subject to both false + * positives and negatives. 
*/ void synchronize_srcu(struct srcu_struct *ssp) { - if (srcu_might_be_idle(ssp) || rcu_gp_is_expedited()) + if (srcu_should_expedite(ssp) || rcu_gp_is_expedited()) synchronize_srcu_expedited(ssp); else __synchronize_srcu(ssp, true); diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 6333f4ccf024..59314da5eb60 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -986,6 +986,15 @@ static bool rcu_tasks_is_holdout(struct task_struct *t) return false; /* + * t->on_rq && !t->se.sched_delayed *could* be considered sleeping but + * since it is a spurious state (it will transition into the + * traditional blocked state or get woken up without outside + * dependencies), not considering it such should only affect timing. + * + * Be conservative for now and not include it. + */ + + /* * Idle tasks (or idle injection) within the idle loop are RCU-tasks * quiescent states. But CPU boot code performed by the idle task * isn't a quiescent state. @@ -1398,7 +1407,8 @@ static void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func) */ void synchronize_rcu_tasks_rude(void) { - synchronize_rcu_tasks_generic(&rcu_tasks_rude); + if (!IS_ENABLED(CONFIG_ARCH_WANTS_NO_INSTR) || IS_ENABLED(CONFIG_FORCE_TASKS_RUDE_RCU)) + synchronize_rcu_tasks_generic(&rcu_tasks_rude); } EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude); @@ -1540,22 +1550,7 @@ static void rcu_st_need_qs(struct task_struct *t, u8 v) */ u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new) { - union rcu_special ret; - union rcu_special trs_old = READ_ONCE(t->trc_reader_special); - union rcu_special trs_new = trs_old; - - if (trs_old.b.need_qs != old) - return trs_old.b.need_qs; - trs_new.b.need_qs = new; - - // Although cmpxchg() appears to KCSAN to update all four bytes, - // only the .b.need_qs byte actually changes. - instrument_atomic_read_write(&t->trc_reader_special.b.need_qs, - sizeof(t->trc_reader_special.b.need_qs)); - // Avoid false-positive KCSAN failures. 
- ret.s = data_race(cmpxchg(&t->trc_reader_special.s, trs_old.s, trs_new.s)); - - return ret.b.need_qs; + return cmpxchg(&t->trc_reader_special.b.need_qs, old, new); } EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a60616e69b66..ff98233d4aa5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3511,7 +3511,7 @@ static int krc_count(struct kfree_rcu_cpu *krcp) } static void -schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) +__schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) { long delay, delay_left; @@ -3526,6 +3526,16 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) } static void +schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&krcp->lock, flags); + __schedule_delayed_monitor_work(krcp); + raw_spin_unlock_irqrestore(&krcp->lock, flags); +} + +static void kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp) { struct list_head bulk_ready[FREE_N_CHANNELS]; @@ -3607,11 +3617,12 @@ kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp) } // One work is per one batch, so there are three - // "free channels", the batch can handle. It can - // be that the work is in the pending state when - // channels have been detached following by each - // other. + // "free channels", the batch can handle. Break + // the loop since it is done with this CPU thus + // queuing an RCU work is _always_ success here. queued = queue_rcu_work(system_unbound_wq, &krwp->rcu_work); + WARN_ON_ONCE(!queued); + break; } } @@ -3835,7 +3846,7 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr) // Set timer to drain after KFREE_DRAIN_JIFFIES. 
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING) - schedule_delayed_monitor_work(krcp); + __schedule_delayed_monitor_work(krcp); unlock_return: krc_this_cpu_unlock(krcp, flags); @@ -4193,7 +4204,6 @@ static void start_poll_synchronize_rcu_common(void) struct rcu_data *rdp; struct rcu_node *rnp; - lockdep_assert_irqs_enabled(); local_irq_save(flags); rdp = this_cpu_ptr(&rcu_data); rnp = rdp->mynode; @@ -4218,9 +4228,6 @@ static void start_poll_synchronize_rcu_common(void) * grace period has elapsed in the meantime. If the needed grace period * is not already slated to start, notifies RCU core of the need for that * grace period. - * - * Interrupts must be enabled for the case where it is necessary to awaken - * the grace-period kthread. */ unsigned long start_poll_synchronize_rcu(void) { @@ -4241,9 +4248,6 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); * grace period (whether normal or expedited) has elapsed in the meantime. * If the needed grace period is not already slated to start, notifies * RCU core of the need for that grace period. - * - * Interrupts must be enabled for the case where it is necessary to awaken - * the grace-period kthread. */ void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) { @@ -5579,8 +5583,7 @@ void rcu_init_geometry(void) * Complain and fall back to the compile-time values if this * limit is exceeded. 
*/ - if (rcu_fanout_leaf < 2 || - rcu_fanout_leaf > sizeof(unsigned long) * 8) { + if (rcu_fanout_leaf < 2 || rcu_fanout_leaf > BITS_PER_LONG) { rcu_fanout_leaf = RCU_FANOUT_LEAF; WARN_ON(1); return; diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 97b99cd06923..2605dd234a13 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -554,13 +554,19 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY, TPS("WakeLazy")); - } else if (!irqs_disabled_flags(flags)) { + } else if (!irqs_disabled_flags(flags) && cpu_online(rdp->cpu)) { /* ... if queue was empty ... */ rcu_nocb_unlock(rdp); wake_nocb_gp(rdp, false); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeEmpty")); } else { + /* + * Don't do the wake-up upfront on fragile paths. + * Also offline CPUs can't call swake_up_one_online() from + * (soft-)IRQs. Rely on the final deferred wake-up from + * rcutree_report_cpu_dead() + */ rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE, TPS("WakeEmptyIsDeferred")); @@ -885,7 +891,18 @@ static void nocb_cb_wait(struct rcu_data *rdp) swait_event_interruptible_exclusive(rdp->nocb_cb_wq, nocb_cb_wait_cond(rdp)); if (kthread_should_park()) { - kthread_parkme(); + /* + * kthread_park() must be preceded by an rcu_barrier(). + * But yet another rcu_barrier() might have sneaked in between + * the barrier callback execution and the callbacks counter + * decrement. 
+ */ + if (rdp->nocb_cb_sleep) { + rcu_nocb_lock_irqsave(rdp, flags); + WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist)); + rcu_nocb_unlock_irqrestore(rdp, flags); + kthread_parkme(); + } } else if (READ_ONCE(rdp->nocb_cb_sleep)) { WARN_ON(signal_pending(current)); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1c7cbd145d5e..3927ea5f7955 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -183,9 +183,9 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) switch (blkd_state) { case 0: case RCU_EXP_TASKS: - case RCU_EXP_TASKS + RCU_GP_BLKD: + case RCU_EXP_TASKS | RCU_GP_BLKD: case RCU_GP_TASKS: - case RCU_GP_TASKS + RCU_EXP_TASKS: + case RCU_GP_TASKS | RCU_EXP_TASKS: /* * Blocking neither GP, or first task blocking the normal @@ -198,10 +198,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) case RCU_EXP_BLKD: case RCU_GP_BLKD: - case RCU_GP_BLKD + RCU_EXP_BLKD: - case RCU_GP_TASKS + RCU_EXP_BLKD: - case RCU_GP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: - case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: + case RCU_GP_BLKD | RCU_EXP_BLKD: + case RCU_GP_TASKS | RCU_EXP_BLKD: + case RCU_GP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD: + case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD: /* * First task arriving that blocks either GP, or first task @@ -214,9 +214,9 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) list_add_tail(&t->rcu_node_entry, &rnp->blkd_tasks); break; - case RCU_EXP_TASKS + RCU_EXP_BLKD: - case RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: - case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_EXP_BLKD: + case RCU_EXP_TASKS | RCU_EXP_BLKD: + case RCU_EXP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD: + case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_EXP_BLKD: /* * Second or subsequent task blocking the expedited GP. 
@@ -227,8 +227,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) list_add(&t->rcu_node_entry, rnp->exp_tasks); break; - case RCU_GP_TASKS + RCU_GP_BLKD: - case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD: + case RCU_GP_TASKS | RCU_GP_BLKD: + case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_GP_BLKD: /* * Second or subsequent task blocking the normal GP. diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 4432db6d0b99..925fcdad5dea 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -76,36 +76,6 @@ int rcu_jiffies_till_stall_check(void) } EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); -/** - * rcu_gp_might_be_stalled - Is it likely that the grace period is stalled? - * - * Returns @true if the current grace period is sufficiently old that - * it is reasonable to assume that it might be stalled. This can be - * useful when deciding whether to allocate memory to enable RCU-mediated - * freeing on the one hand or just invoking synchronize_rcu() on the other. - * The latter is preferable when the grace period is stalled. - * - * Note that sampling of the .gp_start and .gp_seq fields must be done - * carefully to avoid false positives at the beginnings and ends of - * grace periods. - */ -bool rcu_gp_might_be_stalled(void) -{ - unsigned long d = rcu_jiffies_till_stall_check() / RCU_STALL_MIGHT_DIV; - unsigned long j = jiffies; - - if (d < RCU_STALL_MIGHT_MIN) - d = RCU_STALL_MIGHT_MIN; - smp_mb(); // jiffies before .gp_seq to avoid false positives. - if (!rcu_gp_in_progress()) - return false; - // Long delays at this point avoids false positive, but a delay - // of ULONG_MAX/4 jiffies voids your no-false-positive warranty. - smp_mb(); // .gp_seq before second .gp_start - // And ditto here. - return !time_before(j, READ_ONCE(rcu_state.gp_start) + d); -} - /* Don't do RCU CPU stall warnings during long sysrq printouts. 
*/ void rcu_sysrq_start(void) { @@ -365,7 +335,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) * that don't support NMI-based stack dumps. The NMI-triggered stack * traces are more accurate because they are printed by the target CPU. */ -static void rcu_dump_cpu_stacks(void) +static void rcu_dump_cpu_stacks(unsigned long gp_seq) { int cpu; unsigned long flags; @@ -373,15 +343,23 @@ static void rcu_dump_cpu_stacks(void) rcu_for_each_leaf_node(rnp) { printk_deferred_enter(); - raw_spin_lock_irqsave_rcu_node(rnp, flags); - for_each_leaf_node_possible_cpu(rnp, cpu) + for_each_leaf_node_possible_cpu(rnp, cpu) { + if (gp_seq != data_race(rcu_state.gp_seq)) { + printk_deferred_exit(); + pr_err("INFO: Stall ended during stack backtracing.\n"); + return; + } + if (!(data_race(rnp->qsmask) & leaf_node_cpu_bit(rnp, cpu))) + continue; + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { if (cpu_is_offline(cpu)) pr_err("Offline CPU %d blocking current GP.\n", cpu); else dump_cpu_task(cpu); } - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } printk_deferred_exit(); } } @@ -638,7 +616,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, data_race(rcu_state.n_online_cpus)); // Diagnostic read if (ndetected) { - rcu_dump_cpu_stacks(); + rcu_dump_cpu_stacks(gp_seq); /* Complain about tasks blocking the grace period. */ rcu_for_each_leaf_node(rnp) @@ -670,7 +648,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) rcu_force_quiescent_state(); /* Kick them all. 
*/ } -static void print_cpu_stall(unsigned long gps) +static void print_cpu_stall(unsigned long gp_seq, unsigned long gps) { int cpu; unsigned long flags; @@ -707,7 +685,7 @@ static void print_cpu_stall(unsigned long gps) rcu_check_gp_kthread_expired_fqs_timer(); rcu_check_gp_kthread_starvation(); - rcu_dump_cpu_stacks(); + rcu_dump_cpu_stacks(gp_seq); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Rewrite if needed in case of slow consoles. */ @@ -789,7 +767,8 @@ static void check_cpu_stall(struct rcu_data *rdp) gs2 = READ_ONCE(rcu_state.gp_seq); if (gs1 != gs2 || ULONG_CMP_LT(j, js) || - ULONG_CMP_GE(gps, js)) + ULONG_CMP_GE(gps, js) || + !rcu_seq_state(gs2)) return; /* No stall or GP completed since entering function. */ rnp = rdp->mynode; jn = jiffies + ULONG_MAX / 2; @@ -810,7 +789,7 @@ static void check_cpu_stall(struct rcu_data *rdp) pr_err("INFO: %s detected stall, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name); } else if (self_detected) { /* We haven't checked in, so go dump stack. */ - print_cpu_stall(gps); + print_cpu_stall(gs2, gps); } else { /* They had a few time units to dump stack, so complain. 
*/ print_other_cpu_stall(gs2, gps); diff --git a/kernel/resource.c b/kernel/resource.c index b730bd28b422..4101016e8b20 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -459,9 +459,7 @@ int walk_system_ram_res_rev(u64 start, u64 end, void *arg, rams_size += 16; } - rams[i].start = res.start; - rams[i++].end = res.end; - + rams[i++] = res; start = res.end + 1; } diff --git a/kernel/resource_kunit.c b/kernel/resource_kunit.c index 42d2d8d20f5d..b8ef75b99eb2 100644 --- a/kernel/resource_kunit.c +++ b/kernel/resource_kunit.c @@ -169,6 +169,8 @@ static void resource_test_intersection(struct kunit *test) #define RES_TEST_RAM3_SIZE SZ_1M #define RES_TEST_TOTAL_SIZE ((RES_TEST_WIN1_OFFSET + RES_TEST_WIN1_SIZE)) +KUNIT_DEFINE_ACTION_WRAPPER(kfree_wrapper, kfree, const void *); + static void remove_free_resource(void *ctx) { struct resource *res = (struct resource *)ctx; @@ -177,6 +179,14 @@ static void remove_free_resource(void *ctx) kfree(res); } +static void resource_test_add_action_or_abort( + struct kunit *test, void (*action)(void *), void *ctx) +{ + KUNIT_ASSERT_EQ_MSG(test, 0, + kunit_add_action_or_reset(test, action, ctx), + "Fail to add action"); +} + static void resource_test_request_region(struct kunit *test, struct resource *parent, resource_size_t start, resource_size_t size, const char *name, unsigned long flags) @@ -185,7 +195,7 @@ static void resource_test_request_region(struct kunit *test, struct resource *pa res = __request_region(parent, start, size, name, flags); KUNIT_ASSERT_NOT_NULL(test, res); - kunit_add_action_or_reset(test, remove_free_resource, res); + resource_test_add_action_or_abort(test, remove_free_resource, res); } static void resource_test_insert_resource(struct kunit *test, struct resource *parent, @@ -202,11 +212,11 @@ static void resource_test_insert_resource(struct kunit *test, struct resource *p res->end = start + size - 1; res->flags = flags; if (insert_resource(parent, res)) { - kfree(res); + 
resource_test_add_action_or_abort(test, kfree_wrapper, res); KUNIT_FAIL_AND_ABORT(test, "Fail to insert resource %pR\n", res); } - kunit_add_action_or_reset(test, remove_free_resource, res); + resource_test_add_action_or_abort(test, remove_free_resource, res); } static void resource_test_region_intersects(struct kunit *test) @@ -220,7 +230,7 @@ static void resource_test_region_intersects(struct kunit *test) "test resources"); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, parent); start = parent->start; - kunit_add_action_or_reset(test, remove_free_resource, parent); + resource_test_add_action_or_abort(test, remove_free_resource, parent); resource_test_request_region(test, parent, start + RES_TEST_RAM0_OFFSET, RES_TEST_RAM0_SIZE, "Test System RAM 0", flags); diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 44e83a646264..d86d2d9c4624 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -97,6 +97,7 @@ struct scf_statistics { static struct scf_statistics *scf_stats_p; static struct task_struct *scf_torture_stats_task; static DEFINE_PER_CPU(long long, scf_invoked_count); +static DEFINE_PER_CPU(struct llist_head, scf_free_pool); // Data for random primitive selection #define SCF_PRIM_RESCHED 0 @@ -133,6 +134,7 @@ struct scf_check { bool scfc_wait; bool scfc_rpc; struct completion scfc_completion; + struct llist_node scf_node; }; // Use to wait for all threads to start. @@ -148,6 +150,33 @@ static DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand); extern void resched_cpu(int cpu); // An alternative IPI vector. 
+static void scf_add_to_free_list(struct scf_check *scfcp) +{ + struct llist_head *pool; + unsigned int cpu; + + if (!scfcp) + return; + cpu = raw_smp_processor_id() % nthreads; + pool = &per_cpu(scf_free_pool, cpu); + llist_add(&scfcp->scf_node, pool); +} + +static void scf_cleanup_free_list(unsigned int cpu) +{ + struct llist_head *pool; + struct llist_node *node; + struct scf_check *scfcp; + + pool = &per_cpu(scf_free_pool, cpu); + node = llist_del_all(pool); + while (node) { + scfcp = llist_entry(node, struct scf_check, scf_node); + node = node->next; + kfree(scfcp); + } +} + // Print torture statistics. Caller must ensure serialization. static void scf_torture_stats_print(void) { @@ -296,7 +325,7 @@ out: if (scfcp->scfc_rpc) complete(&scfcp->scfc_completion); } else { - kfree(scfcp); + scf_add_to_free_list(scfcp); } } @@ -320,10 +349,6 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra struct scf_check *scfcp = NULL; struct scf_selector *scfsp = scf_sel_rand(trsp); - if (use_cpus_read_lock) - cpus_read_lock(); - else - preempt_disable(); if (scfsp->scfs_prim == SCF_PRIM_SINGLE || scfsp->scfs_wait) { scfcp = kmalloc(sizeof(*scfcp), GFP_ATOMIC); if (!scfcp) { @@ -337,6 +362,10 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfcp->scfc_rpc = false; } } + if (use_cpus_read_lock) + cpus_read_lock(); + else + preempt_disable(); switch (scfsp->scfs_prim) { case SCF_PRIM_RESCHED: if (IS_BUILTIN(CONFIG_SCF_TORTURE_TEST)) { @@ -363,7 +392,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfp->n_single_wait_ofl++; else scfp->n_single_ofl++; - kfree(scfcp); + scf_add_to_free_list(scfcp); scfcp = NULL; } break; @@ -391,7 +420,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra preempt_disable(); } else { scfp->n_single_rpc_ofl++; - kfree(scfcp); + scf_add_to_free_list(scfcp); scfcp = NULL; } break; @@ -428,7 +457,7 @@ static void 
scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra pr_warn("%s: Memory-ordering failure, scfs_prim: %d.\n", __func__, scfsp->scfs_prim); atomic_inc(&n_mb_out_errs); // Leak rather than trash! } else { - kfree(scfcp); + scf_add_to_free_list(scfcp); } barrier(); // Prevent race-reduction compiler optimizations. } @@ -463,7 +492,7 @@ static int scftorture_invoker(void *arg) // Make sure that the CPU is affinitized appropriately during testing. curcpu = raw_smp_processor_id(); - WARN_ONCE(curcpu != scfp->cpu % nr_cpu_ids, + WARN_ONCE(curcpu != cpu, "%s: Wanted CPU %d, running on %d, nr_cpu_ids = %d\n", __func__, scfp->cpu, curcpu, nr_cpu_ids); @@ -479,6 +508,8 @@ static int scftorture_invoker(void *arg) VERBOSE_SCFTORTOUT("scftorture_invoker %d started", scfp->cpu); do { + scf_cleanup_free_list(cpu); + scftorture_invoke_one(scfp, &rand); while (cpu_is_offline(cpu) && !torture_must_stop()) { schedule_timeout_interruptible(HZ / 5); @@ -523,12 +554,15 @@ static void scf_torture_cleanup(void) torture_stop_kthread("scftorture_invoker", scf_stats_p[i].task); else goto end; - smp_call_function(scf_cleanup_handler, NULL, 0); + smp_call_function(scf_cleanup_handler, NULL, 1); torture_stop_kthread(scf_torture_stats, scf_torture_stats_task); scf_torture_stats_print(); // -After- the stats thread is stopped! kfree(scf_stats_p); // -After- the last stats print has completed! scf_stats_p = NULL; + for (i = 0; i < nr_cpu_ids; i++) + scf_cleanup_free_list(i); + if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) || atomic_read(&n_mb_out_errs)) scftorture_print_module_parms("End of test: FAILURE"); else if (torture_onoff_failures()) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 43e453ab7e20..95e40895a519 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -548,6 +548,11 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } * ON_RQ_MIGRATING state is used for migration without holding both * rq->locks. 
It indicates task_cpu() is not stable, see task_rq_lock(). * + * Additionally it is possible to be ->on_rq but still be considered not + * runnable when p->se.sched_delayed is true. These tasks are on the runqueue + * but will be dequeued as soon as they get picked again. See the + * task_is_runnable() helper. + * * p->on_cpu <- { 0, 1 }: * * is set by prepare_task() and cleared by finish_task() such that it will be @@ -827,7 +832,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) rq_lock(rq, &rf); update_rq_clock(rq); - rq->curr->sched_class->task_tick(rq, rq->curr, 1); + rq->donor->sched_class->task_tick(rq, rq->curr, 1); rq_unlock(rq, &rf); return HRTIMER_NORESTART; @@ -936,10 +941,9 @@ static inline void hrtick_rq_init(struct rq *rq) * this avoids any races wrt polling state changes and thereby avoids * spurious IPIs. */ -static inline bool set_nr_and_not_polling(struct task_struct *p) +static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif) { - struct thread_info *ti = task_thread_info(p); - return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); + return !(fetch_or(&ti->flags, 1 << tif) & _TIF_POLLING_NRFLAG); } /* @@ -964,9 +968,9 @@ static bool set_nr_if_polling(struct task_struct *p) } #else -static inline bool set_nr_and_not_polling(struct task_struct *p) +static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif) { - set_tsk_need_resched(p); + set_ti_thread_flag(ti, tif); return true; } @@ -1071,28 +1075,70 @@ void wake_up_q(struct wake_q_head *head) * might also involve a cross-CPU call to trigger the scheduler on * the target CPU. */ -void resched_curr(struct rq *rq) +static void __resched_curr(struct rq *rq, int tif) { struct task_struct *curr = rq->curr; + struct thread_info *cti = task_thread_info(curr); int cpu; lockdep_assert_rq_held(rq); - if (test_tsk_need_resched(curr)) + /* + * Always immediately preempt the idle task; no point in delaying doing + * actual work. 
+ */ + if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY) + tif = TIF_NEED_RESCHED; + + if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED)) return; cpu = cpu_of(rq); if (cpu == smp_processor_id()) { - set_tsk_need_resched(curr); - set_preempt_need_resched(); + set_ti_thread_flag(cti, tif); + if (tif == TIF_NEED_RESCHED) + set_preempt_need_resched(); return; } - if (set_nr_and_not_polling(curr)) - smp_send_reschedule(cpu); - else + if (set_nr_and_not_polling(cti, tif)) { + if (tif == TIF_NEED_RESCHED) + smp_send_reschedule(cpu); + } else { trace_sched_wake_idle_without_ipi(cpu); + } +} + +void resched_curr(struct rq *rq) +{ + __resched_curr(rq, TIF_NEED_RESCHED); +} + +#ifdef CONFIG_PREEMPT_DYNAMIC +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy); +static __always_inline bool dynamic_preempt_lazy(void) +{ + return static_branch_unlikely(&sk_dynamic_preempt_lazy); +} +#else +static __always_inline bool dynamic_preempt_lazy(void) +{ + return IS_ENABLED(CONFIG_PREEMPT_LAZY); +} +#endif + +static __always_inline int get_lazy_tif_bit(void) +{ + if (dynamic_preempt_lazy()) + return TIF_NEED_RESCHED_LAZY; + + return TIF_NEED_RESCHED; +} + +void resched_curr_lazy(struct rq *rq) +{ + __resched_curr(rq, get_lazy_tif_bit()); } void resched_cpu(int cpu) @@ -1187,7 +1233,7 @@ static void wake_up_idle_cpu(int cpu) * and testing of the above solutions didn't appear to report * much benefits. */ - if (set_nr_and_not_polling(rq->idle)) + if (set_nr_and_not_polling(task_thread_info(rq->idle), TIF_NEED_RESCHED)) smp_send_reschedule(cpu); else trace_sched_wake_idle_without_ipi(cpu); @@ -1394,7 +1440,7 @@ void set_load_weight(struct task_struct *p, bool update_load) * requests are serialized using a mutex to reduce the risk of conflicting * updates or API abuses. 
*/ -static DEFINE_MUTEX(uclamp_mutex); +static __maybe_unused DEFINE_MUTEX(uclamp_mutex); /* Max allowed minimum utilization */ static unsigned int __maybe_unused sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; @@ -2012,11 +2058,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & ENQUEUE_NOCLOCK)) update_rq_clock(rq); - if (!(flags & ENQUEUE_RESTORE)) { - sched_info_enqueue(rq, p); - psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED)); - } - p->sched_class->enqueue_task(rq, p, flags); /* * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear @@ -2024,6 +2065,11 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) */ uclamp_rq_inc(rq, p); + psi_enqueue(p, flags); + + if (!(flags & ENQUEUE_RESTORE)) + sched_info_enqueue(rq, p); + if (sched_core_enabled(rq)) sched_core_enqueue(rq, p); } @@ -2039,10 +2085,10 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & DEQUEUE_NOCLOCK)) update_rq_clock(rq); - if (!(flags & DEQUEUE_SAVE)) { + if (!(flags & DEQUEUE_SAVE)) sched_info_dequeue(rq, p); - psi_dequeue(p, flags & DEQUEUE_SLEEP); - } + + psi_dequeue(p, flags); /* * Must be before ->dequeue_task() because ->dequeue_task() can 'fail' @@ -2130,16 +2176,18 @@ void check_class_changed(struct rq *rq, struct task_struct *p, void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags) { - if (p->sched_class == rq->curr->sched_class) - rq->curr->sched_class->wakeup_preempt(rq, p, flags); - else if (sched_class_above(p->sched_class, rq->curr->sched_class)) + struct task_struct *donor = rq->donor; + + if (p->sched_class == donor->sched_class) + donor->sched_class->wakeup_preempt(rq, p, flags); + else if (sched_class_above(p->sched_class, donor->sched_class)) resched_curr(rq); /* * A queue event has occurred, and we're going to schedule. In * this case, we can save a useless back to back clock update. 
*/ - if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) + if (task_on_rq_queued(donor) && test_tsk_need_resched(rq->curr)) rq_clock_skip_update(rq); } @@ -2615,9 +2663,7 @@ int push_cpu_stop(void *arg) // XXX validate p is still the highest prio task if (task_rq(p) == rq) { - deactivate_task(rq, p, 0); - set_task_cpu(p, lowest_rq->cpu); - activate_task(lowest_rq, p, 0); + move_queued_task_locked(rq, lowest_rq, p); resched_curr(lowest_rq); } @@ -2677,7 +2723,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) lockdep_assert_held(&p->pi_lock); queued = task_on_rq_queued(p); - running = task_current(rq, p); + running = task_current_donor(rq, p); if (queued) { /* @@ -2691,6 +2737,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) put_prev_task(rq, p); p->sched_class->set_cpus_allowed(p, ctx); + mm_set_cpus_allowed(p->mm, ctx->new_mask); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); @@ -3303,9 +3350,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) rq_pin_lock(src_rq, &srf); rq_pin_lock(dst_rq, &drf); - deactivate_task(src_rq, p, 0); - set_task_cpu(p, cpu); - activate_task(dst_rq, p, 0); + move_queued_task_locked(src_rq, dst_rq, p); wakeup_preempt(dst_rq, p, 0); rq_unpin_lock(dst_rq, &drf); @@ -3518,14 +3563,16 @@ out: * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable. 
*/ static inline -int select_task_rq(struct task_struct *p, int cpu, int wake_flags) +int select_task_rq(struct task_struct *p, int cpu, int *wake_flags) { lockdep_assert_held(&p->pi_lock); - if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) - cpu = p->sched_class->select_task_rq(p, cpu, wake_flags); - else + if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) { + cpu = p->sched_class->select_task_rq(p, cpu, *wake_flags); + *wake_flags |= WF_RQ_SELECTED; + } else { cpu = cpumask_any(p->cpus_ptr); + } /* * In order not to call set_task_cpu() on a blocking task we need @@ -3659,6 +3706,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, rq->nr_uninterruptible--; #ifdef CONFIG_SMP + if (wake_flags & WF_RQ_SELECTED) + en_flags |= ENQUEUE_RQ_SELECTED; if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; else @@ -4120,6 +4169,8 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) guard(preempt)(); int cpu, success = 0; + wake_flags |= WF_TTWU; + if (p == current) { /* * We're waking current, this means 'p->on_rq' and 'task_cpu(p) @@ -4252,7 +4303,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_cond_load_acquire(&p->on_cpu, !VAL); - cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU); + cpu = select_task_rq(p, p->wake_cpu, &wake_flags); if (task_cpu(p) != cpu) { if (p->in_iowait) { delayacct_blkio_end(p); @@ -4317,9 +4368,10 @@ static bool __task_needs_rq_lock(struct task_struct *p) * @arg: Argument to function. * * Fix the task in it's current state by avoiding wakeups and or rq operations - * and call @func(@arg) on it. This function can use ->on_rq and task_curr() - * to work out what the state is, if required. Given that @func can be invoked - * with a runqueue lock held, it had better be quite lightweight. + * and call @func(@arg) on it. This function can use task_is_runnable() and + * task_curr() to work out what the state is, if required. 
Given that @func + * can be invoked with a runqueue lock held, it had better be quite + * lightweight. * * Returns: * Whatever @func returns @@ -4412,7 +4464,8 @@ int wake_up_state(struct task_struct *p, unsigned int state) * Perform scheduler related setup for a newly forked process p. * p is forked by current. * - * __sched_fork() is basic setup used by init_idle() too: + * __sched_fork() is basic setup which is also used by sched_init() to + * initialize the boot CPU's idle task. */ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) { @@ -4699,7 +4752,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) if (rt_prio(p->prio)) { p->sched_class = &rt_sched_class; #ifdef CONFIG_SCHED_CLASS_EXT - } else if (task_should_scx(p)) { + } else if (task_should_scx(p->policy)) { p->sched_class = &ext_sched_class; #endif } else { @@ -4793,6 +4846,7 @@ void wake_up_new_task(struct task_struct *p) { struct rq_flags rf; struct rq *rq; + int wake_flags = WF_FORK; raw_spin_lock_irqsave(&p->pi_lock, rf.flags); WRITE_ONCE(p->__state, TASK_RUNNING); @@ -4807,7 +4861,7 @@ void wake_up_new_task(struct task_struct *p) */ p->recent_used_cpu = task_cpu(p); rseq_migrate(p); - __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK)); + __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags)); #endif rq = __task_rq_lock(p, &rf); update_rq_clock(rq); @@ -4815,7 +4869,7 @@ void wake_up_new_task(struct task_struct *p) activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL); trace_sched_wakeup_new(p); - wakeup_preempt(rq, p, WF_FORK); + wakeup_preempt(rq, p, wake_flags); #ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* @@ -5504,7 +5558,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) * project cycles that may never be accounted to this * thread, breaking clock_gettime(). 
*/ - if (task_current(rq, p) && task_on_rq_queued(p)) { + if (task_current_donor(rq, p) && task_on_rq_queued(p)) { prefetch_curr_exec_start(p); update_rq_clock(rq); p->sched_class->update_curr(rq); @@ -5572,7 +5626,8 @@ void sched_tick(void) { int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); - struct task_struct *curr; + /* accounting goes to the donor task */ + struct task_struct *donor; struct rq_flags rf; unsigned long hw_pressure; u64 resched_latency; @@ -5583,19 +5638,23 @@ void sched_tick(void) sched_clock_tick(); rq_lock(rq, &rf); + donor = rq->donor; - curr = rq->curr; - psi_account_irqtime(rq, curr, NULL); + psi_account_irqtime(rq, donor, NULL); update_rq_clock(rq); hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure); - curr->sched_class->task_tick(rq, curr, 0); + + if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY)) + resched_curr(rq); + + donor->sched_class->task_tick(rq, donor, 0); if (sched_feat(LATENCY_WARN)) resched_latency = cpu_resched_latency(rq); calc_global_load_tick(rq); sched_core_tick(rq); - task_tick_mm_cid(rq, curr); + task_tick_mm_cid(rq, donor); scx_tick(rq); rq_unlock(rq, &rf); @@ -5605,8 +5664,8 @@ void sched_tick(void) perf_event_task_tick(); - if (curr->flags & PF_WQ_WORKER) - wq_worker_tick(curr); + if (donor->flags & PF_WQ_WORKER) + wq_worker_tick(donor); #ifdef CONFIG_SMP if (!scx_switched_all()) { @@ -5673,6 +5732,12 @@ static void sched_tick_remote(struct work_struct *work) struct task_struct *curr = rq->curr; if (cpu_online(cpu)) { + /* + * Since this is a remote tick for full dynticks mode, + * we are always sure that there is no proxy (only a + * single task is running). 
+ */ + SCHED_WARN_ON(rq->curr != rq->donor); update_rq_clock(rq); if (!is_idle_task(curr)) { @@ -5907,12 +5972,15 @@ static void prev_balance(struct rq *rq, struct task_struct *prev, #ifdef CONFIG_SCHED_CLASS_EXT /* - * SCX requires a balance() call before every pick_next_task() including - * when waking up from SCHED_IDLE. If @start_class is below SCX, start - * from SCX instead. + * SCX requires a balance() call before every pick_task() including when + * waking up from SCHED_IDLE. If @start_class is below SCX, start from + * SCX instead. Also, set a flag to detect missing balance() call. */ - if (scx_enabled() && sched_class_above(&ext_sched_class, start_class)) - start_class = &ext_sched_class; + if (scx_enabled()) { + rq->scx.flags |= SCX_RQ_BAL_PENDING; + if (sched_class_above(&ext_sched_class, start_class)) + start_class = &ext_sched_class; + } #endif /* @@ -6293,10 +6361,7 @@ static bool try_steal_cookie(int this, int that) if (sched_task_is_throttled(p, this)) goto next; - deactivate_task(src, p, 0); - set_task_cpu(p, this); - activate_task(dst, p, 0); - + move_queued_task_locked(src, dst, p); resched_curr(dst); success = true; @@ -6491,6 +6556,45 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) #define SM_RTLOCK_WAIT 2 /* + * Helper function for __schedule() + * + * If a task does not have signals pending, deactivate it + * Otherwise marks the task's __state as RUNNING + */ +static bool try_to_block_task(struct rq *rq, struct task_struct *p, + unsigned long task_state) +{ + int flags = DEQUEUE_NOCLOCK; + + if (signal_pending_state(task_state, p)) { + WRITE_ONCE(p->__state, TASK_RUNNING); + return false; + } + + p->sched_contributes_to_load = + (task_state & TASK_UNINTERRUPTIBLE) && + !(task_state & TASK_NOLOAD) && + !(task_state & TASK_FROZEN); + + if (unlikely(is_special_task_state(task_state))) + flags |= DEQUEUE_SPECIAL; + + /* + * __schedule() ttwu() + * prev_state = prev->state; if (p->on_rq && ...) 
+ * if (prev_state) goto out; + * p->on_rq = 0; smp_acquire__after_ctrl_dep(); + * p->state = TASK_WAKING + * + * Where __schedule() and ttwu() have matching control dependencies. + * + * After this, schedule() must not care about p->state any more. + */ + block_task(rq, p, flags); + return true; +} + +/* * __schedule() is the main scheduler function. * * The main means of driving the scheduler and thus entering this function are: @@ -6537,6 +6641,7 @@ static void __sched notrace __schedule(int sched_mode) * as a preemption by schedule_debug() and RCU. */ bool preempt = sched_mode > SM_NONE; + bool block = false; unsigned long *switch_count; unsigned long prev_state; struct rq_flags rf; @@ -6597,36 +6702,12 @@ static void __sched notrace __schedule(int sched_mode) goto picked; } } else if (!preempt && prev_state) { - if (signal_pending_state(prev_state, prev)) { - WRITE_ONCE(prev->__state, TASK_RUNNING); - } else { - int flags = DEQUEUE_NOCLOCK; - - prev->sched_contributes_to_load = - (prev_state & TASK_UNINTERRUPTIBLE) && - !(prev_state & TASK_NOLOAD) && - !(prev_state & TASK_FROZEN); - - if (unlikely(is_special_task_state(prev_state))) - flags |= DEQUEUE_SPECIAL; - - /* - * __schedule() ttwu() - * prev_state = prev->state; if (p->on_rq && ...) - * if (prev_state) goto out; - * p->on_rq = 0; smp_acquire__after_ctrl_dep(); - * p->state = TASK_WAKING - * - * Where __schedule() and ttwu() have matching control dependencies. - * - * After this, schedule() must not care about p->state any more. 
- */ - block_task(rq, prev, flags); - } + block = try_to_block_task(rq, prev, prev_state); switch_count = &prev->nvcsw; } next = pick_next_task(rq, prev, &rf); + rq_set_donor(rq, next); picked: clear_tsk_need_resched(prev); clear_preempt_need_resched(); @@ -6667,7 +6748,7 @@ picked: migrate_disable_switch(rq, prev); psi_account_irqtime(rq, prev, next); - psi_sched_switch(prev, next, !task_on_rq_queued(prev)); + psi_sched_switch(prev, next, block); trace_sched_switch(preempt, prev, next, prev_state); @@ -7010,20 +7091,20 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag } EXPORT_SYMBOL(default_wake_function); -void __setscheduler_prio(struct task_struct *p, int prio) +const struct sched_class *__setscheduler_class(int policy, int prio) { if (dl_prio(prio)) - p->sched_class = &dl_sched_class; - else if (rt_prio(prio)) - p->sched_class = &rt_sched_class; + return &dl_sched_class; + + if (rt_prio(prio)) + return &rt_sched_class; + #ifdef CONFIG_SCHED_CLASS_EXT - else if (task_should_scx(p)) - p->sched_class = &ext_sched_class; + if (task_should_scx(policy)) + return &ext_sched_class; #endif - else - p->sched_class = &fair_sched_class; - p->prio = prio; + return &fair_sched_class; } #ifdef CONFIG_RT_MUTEXES @@ -7069,7 +7150,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) { int prio, oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; - const struct sched_class *prev_class; + const struct sched_class *prev_class, *next_class; struct rq_flags rf; struct rq *rq; @@ -7127,8 +7208,13 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) queue_flag &= ~DEQUEUE_MOVE; prev_class = p->sched_class; + next_class = __setscheduler_class(p->policy, prio); + + if (prev_class != next_class && p->se.sched_delayed) + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); + queued = task_on_rq_queued(p); - running = task_current(rq, p); + running = 
task_current_donor(rq, p); if (queued) dequeue_task(rq, p, queue_flag); if (running) @@ -7164,7 +7250,9 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) p->rt.timeout = 0; } - __setscheduler_prio(p, prio); + p->sched_class = next_class; + p->prio = prio; + check_class_changing(rq, p, prev_class); if (queued) @@ -7326,6 +7414,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); * preempt_schedule <- NOP * preempt_schedule_notrace <- NOP * irqentry_exit_cond_resched <- NOP + * dynamic_preempt_lazy <- false * * VOLUNTARY: * cond_resched <- __cond_resched @@ -7333,6 +7422,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); * preempt_schedule <- NOP * preempt_schedule_notrace <- NOP * irqentry_exit_cond_resched <- NOP + * dynamic_preempt_lazy <- false * * FULL: * cond_resched <- RET0 @@ -7340,6 +7430,15 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); * preempt_schedule <- preempt_schedule * preempt_schedule_notrace <- preempt_schedule_notrace * irqentry_exit_cond_resched <- irqentry_exit_cond_resched + * dynamic_preempt_lazy <- false + * + * LAZY: + * cond_resched <- RET0 + * might_resched <- RET0 + * preempt_schedule <- preempt_schedule + * preempt_schedule_notrace <- preempt_schedule_notrace + * irqentry_exit_cond_resched <- irqentry_exit_cond_resched + * dynamic_preempt_lazy <- true */ enum { @@ -7347,30 +7446,41 @@ enum { preempt_dynamic_none, preempt_dynamic_voluntary, preempt_dynamic_full, + preempt_dynamic_lazy, }; int preempt_dynamic_mode = preempt_dynamic_undefined; int sched_dynamic_mode(const char *str) { +#ifndef CONFIG_PREEMPT_RT if (!strcmp(str, "none")) return preempt_dynamic_none; if (!strcmp(str, "voluntary")) return preempt_dynamic_voluntary; +#endif if (!strcmp(str, "full")) return preempt_dynamic_full; +#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY + if (!strcmp(str, "lazy")) + return preempt_dynamic_lazy; +#endif + return -EINVAL; } +#define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key) +#define 
preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key) + #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) #define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) #define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) -#define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key) -#define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key) +#define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f) +#define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f) #else #error "Unsupported PREEMPT_DYNAMIC mechanism" #endif @@ -7390,6 +7500,7 @@ static void __sched_dynamic_update(int mode) preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); preempt_dynamic_enable(irqentry_exit_cond_resched); + preempt_dynamic_key_disable(preempt_lazy); switch (mode) { case preempt_dynamic_none: @@ -7399,6 +7510,7 @@ static void __sched_dynamic_update(int mode) preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule_notrace); preempt_dynamic_disable(irqentry_exit_cond_resched); + preempt_dynamic_key_disable(preempt_lazy); if (mode != preempt_dynamic_mode) pr_info("Dynamic Preempt: none\n"); break; @@ -7410,6 +7522,7 @@ static void __sched_dynamic_update(int mode) preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule_notrace); preempt_dynamic_disable(irqentry_exit_cond_resched); + preempt_dynamic_key_disable(preempt_lazy); if (mode != preempt_dynamic_mode) pr_info("Dynamic Preempt: voluntary\n"); break; @@ -7421,9 +7534,22 @@ static void __sched_dynamic_update(int mode) preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); preempt_dynamic_enable(irqentry_exit_cond_resched); + preempt_dynamic_key_disable(preempt_lazy); if (mode != preempt_dynamic_mode) pr_info("Dynamic Preempt: full\n"); break; + + case 
preempt_dynamic_lazy: + if (!klp_override) + preempt_dynamic_disable(cond_resched); + preempt_dynamic_disable(might_resched); + preempt_dynamic_enable(preempt_schedule); + preempt_dynamic_enable(preempt_schedule_notrace); + preempt_dynamic_enable(irqentry_exit_cond_resched); + preempt_dynamic_key_enable(preempt_lazy); + if (mode != preempt_dynamic_mode) + pr_info("Dynamic Preempt: lazy\n"); + break; } preempt_dynamic_mode = mode; @@ -7486,6 +7612,8 @@ static void __init preempt_dynamic_init(void) sched_dynamic_update(preempt_dynamic_none); } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) { sched_dynamic_update(preempt_dynamic_voluntary); + } else if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) { + sched_dynamic_update(preempt_dynamic_lazy); } else { /* Default static call setting, nothing to do */ WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)); @@ -7506,6 +7634,7 @@ static void __init preempt_dynamic_init(void) PREEMPT_MODEL_ACCESSOR(none); PREEMPT_MODEL_ACCESSOR(voluntary); PREEMPT_MODEL_ACCESSOR(full); +PREEMPT_MODEL_ACCESSOR(lazy); #else /* !CONFIG_PREEMPT_DYNAMIC: */ @@ -7658,8 +7787,6 @@ void __init init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - __sched_fork(0, idle); - raw_spin_lock_irqsave(&idle->pi_lock, flags); raw_spin_rq_lock(rq); @@ -7674,10 +7801,8 @@ void __init init_idle(struct task_struct *idle, int cpu) #ifdef CONFIG_SMP /* - * It's possible that init_idle() gets called multiple times on a task, - * in that case do_set_cpus_allowed() will not do the right thing. - * - * And since this is boot we can forgo the serialization. + * No validation and serialization required at boot time and for + * setting up the idle tasks of not yet online CPUs. 
*/ set_cpus_allowed_common(idle, &ac); #endif @@ -7696,6 +7821,7 @@ void __init init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->idle = idle; + rq_set_donor(rq, idle); rcu_assign_pointer(rq->curr, idle); idle->on_rq = TASK_ON_RQ_QUEUED; #ifdef CONFIG_SMP @@ -7785,7 +7911,7 @@ void sched_setnuma(struct task_struct *p, int nid) rq = task_rq_lock(p, &rf); queued = task_on_rq_queued(p); - running = task_current(rq, p); + running = task_current_donor(rq, p); if (queued) dequeue_task(rq, p, DEQUEUE_SAVE); @@ -8521,6 +8647,7 @@ void __init sched_init(void) * but because we are the idle thread, we just pick up running again * when this runqueue becomes "idle". */ + __sched_fork(0, current); init_idle(current, smp_processor_id()); calc_load_update = jiffies + LOAD_FREQ; @@ -8935,7 +9062,7 @@ void sched_move_task(struct task_struct *tsk) update_rq_clock(rq); - running = task_current(rq, tsk); + running = task_current_donor(rq, tsk); queued = task_on_rq_queued(tsk); if (queued) @@ -10228,6 +10355,7 @@ int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq, */ if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) return -1; + WRITE_ONCE(src_pcpu_cid->recent_cid, MM_CID_UNSET); return src_cid; } @@ -10240,7 +10368,8 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { struct mm_cid *src_pcpu_cid, *dst_pcpu_cid; struct mm_struct *mm = t->mm; - int src_cid, dst_cid, src_cpu; + int src_cid, src_cpu; + bool dst_cid_is_set; struct rq *src_rq; lockdep_assert_rq_held(dst_rq); @@ -10257,9 +10386,9 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) * allocation closest to 0 in cases where few threads migrate around * many CPUs. * - * If destination cid is already set, we may have to just clear - * the src cid to ensure compactness in frequent migrations - * scenarios. 
+ * If destination cid or recent cid is already set, we may have + * to just clear the src cid to ensure compactness in frequent + * migrations scenarios. * * It is not useful to clear the src cid when the number of threads is * greater or equal to the number of allowed CPUs, because user-space @@ -10267,9 +10396,9 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) * allowed CPUs. */ dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq)); - dst_cid = READ_ONCE(dst_pcpu_cid->cid); - if (!mm_cid_is_unset(dst_cid) && - atomic_read(&mm->mm_users) >= t->nr_cpus_allowed) + dst_cid_is_set = !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->cid)) || + !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->recent_cid)); + if (dst_cid_is_set && atomic_read(&mm->mm_users) >= READ_ONCE(mm->nr_cpus_allowed)) return; src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu); src_rq = cpu_rq(src_cpu); @@ -10280,13 +10409,14 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) src_cid); if (src_cid == -1) return; - if (!mm_cid_is_unset(dst_cid)) { + if (dst_cid_is_set) { __mm_cid_put(mm, src_cid); return; } /* Move src_cid to dst cpu. */ mm_cid_snapshot_time(dst_rq, mm); WRITE_ONCE(dst_pcpu_cid->cid, src_cid); + WRITE_ONCE(dst_pcpu_cid->recent_cid, src_cid); } static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid, @@ -10458,7 +10588,9 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) return; if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan))) return; - task_work_add(curr, work, TWA_RESUME); + + /* No page allocation under rq lock */ + task_work_add(curr, work, TWA_RESUME | TWAF_NO_ALLOC); } void sched_mm_cid_exit_signals(struct task_struct *t) @@ -10523,7 +10655,7 @@ void sched_mm_cid_after_execve(struct task_struct *t) * Matches barrier in sched_mm_cid_remote_clear_old(). 
*/ smp_mb(); - t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm); + t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm); } rseq_set_notify_resume(t); } diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index c6ba15388ea7..28c77904ea74 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -783,9 +783,8 @@ static int sugov_init(struct cpufreq_policy *policy) if (ret) goto fail; - sugov_eas_rebuild_sd(); - out: + sugov_eas_rebuild_sd(); mutex_unlock(&global_tunables_lock); return 0; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9ce93d0bf452..d9d5a702f1a6 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1339,7 +1339,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) #endif enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); - if (dl_task(rq->curr)) + if (dl_task(rq->donor)) wakeup_preempt_dl(rq, p, 0); else resched_curr(rq); @@ -1736,11 +1736,11 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio */ static void update_curr_dl(struct rq *rq) { - struct task_struct *curr = rq->curr; - struct sched_dl_entity *dl_se = &curr->dl; + struct task_struct *donor = rq->donor; + struct sched_dl_entity *dl_se = &donor->dl; s64 delta_exec; - if (!dl_task(curr) || !on_dl_rq(dl_se)) + if (!dl_task(donor) || !on_dl_rq(dl_se)) return; /* @@ -2213,7 +2213,7 @@ static int find_later_rq(struct task_struct *task); static int select_task_rq_dl(struct task_struct *p, int cpu, int flags) { - struct task_struct *curr; + struct task_struct *curr, *donor; bool select_rq; struct rq *rq; @@ -2224,6 +2224,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags) rcu_read_lock(); curr = READ_ONCE(rq->curr); /* unlocked access */ + donor = READ_ONCE(rq->donor); /* * If we are dealing with a -deadline task, we must @@ -2234,9 +2235,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags) * other hand, if it has a shorter deadline, we * try 
to make it stay here, it might be important. */ - select_rq = unlikely(dl_task(curr)) && + select_rq = unlikely(dl_task(donor)) && (curr->nr_cpus_allowed < 2 || - !dl_entity_preempt(&p->dl, &curr->dl)) && + !dl_entity_preempt(&p->dl, &donor->dl)) && p->nr_cpus_allowed > 1; /* @@ -2299,7 +2300,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * let's hope p can move out. */ if (rq->curr->nr_cpus_allowed == 1 || - !cpudl_find(&rq->rd->cpudl, rq->curr, NULL)) + !cpudl_find(&rq->rd->cpudl, rq->donor, NULL)) return; /* @@ -2338,7 +2339,7 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags) { - if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { + if (dl_entity_preempt(&p->dl, &rq->donor->dl)) { resched_curr(rq); return; } @@ -2348,7 +2349,7 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, * In the unlikely case current and p have the same deadline * let us try to decide what's the best thing to do... 
*/ - if ((p->dl.deadline == rq->curr->dl.deadline) && + if ((p->dl.deadline == rq->donor->dl.deadline) && !test_tsk_need_resched(rq->curr)) check_preempt_equal_dl(rq, p); #endif /* CONFIG_SMP */ @@ -2380,12 +2381,12 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) if (!first) return; - if (rq->curr->sched_class != &dl_sched_class) + if (rq->donor->sched_class != &dl_sched_class) update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); deadline_queue_push_tasks(rq); - if (hrtick_enabled(rq)) + if (hrtick_enabled_dl(rq)) start_hrtick_dl(rq, &p->dl); } @@ -2487,14 +2488,6 @@ static void task_fork_dl(struct task_struct *p) /* Only try algorithms three times */ #define DL_MAX_TRIES 3 -static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) -{ - if (!task_on_cpu(rq, p) && - cpumask_test_cpu(cpu, &p->cpus_mask)) - return 1; - return 0; -} - /* * Return the earliest pushable rq's task, which is suitable to be executed * on the CPU, NULL otherwise: @@ -2513,7 +2506,7 @@ next_node: if (next_node) { p = __node_2_pdl(next_node); - if (pick_dl_task(rq, p, cpu)) + if (task_is_pushable(rq, p, cpu)) return p; next_node = rb_next(next_node); @@ -2707,8 +2700,8 @@ retry: * can move away, it makes sense to just reschedule * without going further in pushing next_task. */ - if (dl_task(rq->curr) && - dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && + if (dl_task(rq->donor) && + dl_time_before(next_task->dl.deadline, rq->donor->dl.deadline) && rq->curr->nr_cpus_allowed > 1) { resched_curr(rq); return 0; @@ -2751,9 +2744,7 @@ retry: goto retry; } - deactivate_task(rq, next_task, 0); - set_task_cpu(next_task, later_rq->cpu); - activate_task(later_rq, next_task, 0); + move_queued_task_locked(rq, later_rq, next_task); ret = 1; resched_curr(later_rq); @@ -2833,15 +2824,13 @@ static void pull_dl_task(struct rq *this_rq) * deadline than the current task of its runqueue. 
*/ if (dl_time_before(p->dl.deadline, - src_rq->curr->dl.deadline)) + src_rq->donor->dl.deadline)) goto skip; if (is_migration_disabled(p)) { push_task = get_push_task(src_rq); } else { - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); + move_queued_task_locked(src_rq, this_rq, p); dmin = p->dl.deadline; resched = true; } @@ -2874,9 +2863,9 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) if (!task_on_cpu(rq, p) && !test_tsk_need_resched(rq->curr) && p->nr_cpus_allowed > 1 && - dl_task(rq->curr) && + dl_task(rq->donor) && (rq->curr->nr_cpus_allowed < 2 || - !dl_entity_preempt(&p->dl, &rq->curr->dl))) { + !dl_entity_preempt(&p->dl, &rq->donor->dl))) { push_dl_tasks(rq); } } @@ -3051,12 +3040,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) return; } - if (rq->curr != p) { + if (rq->donor != p) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) deadline_queue_push_tasks(rq); #endif - if (dl_task(rq->curr)) + if (dl_task(rq->donor)) wakeup_preempt_dl(rq, p, 0); else resched_curr(rq); @@ -3085,7 +3074,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, if (!rq->dl.overloaded) deadline_queue_pull_task(rq); - if (task_current(rq, p)) { + if (task_current_donor(rq, p)) { /* * If we now have a earlier deadline task than p, * then reschedule, provided p is still on this diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index f4035c7a0fa1..a48b2a701ec2 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -245,11 +245,12 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, static int sched_dynamic_show(struct seq_file *m, void *v) { static const char * preempt_modes[] = { - "none", "voluntary", "full" + "none", "voluntary", "full", "lazy", }; - int i; + int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY); + int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2; - for (i = 0; i < 
ARRAY_SIZE(preempt_modes); i++) { + for (; i < j; i++) { if (preempt_dynamic_mode == i) seq_puts(m, "("); seq_puts(m, preempt_modes[i]); diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index c09e3dc38c34..ecb88c528544 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -18,6 +18,12 @@ enum scx_consts { SCX_EXIT_DUMP_DFL_LEN = 32768, SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, + + /* + * Iterating all tasks may take a while. Periodically drop + * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. + */ + SCX_OPS_TASK_ITER_BATCH = 32, }; enum scx_exit_kind { @@ -624,6 +630,10 @@ struct sched_ext_ops { /** * exit - Clean up after the BPF scheduler * @info: Exit info + * + * ops.exit() is also called on ops.init() failure, which is a bit + * unusual. This is to allow rich reporting through @info on how + * ops.init() failed. */ void (*exit)(struct scx_exit_info *info); @@ -691,6 +701,7 @@ enum scx_enq_flags { /* expose select ENQUEUE_* flags as enums */ SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, SCX_ENQ_HEAD = ENQUEUE_HEAD, + SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED, /* high 32bits are SCX specific */ @@ -778,7 +789,6 @@ enum scx_tg_flags { }; enum scx_ops_enable_state { - SCX_OPS_PREPPING, SCX_OPS_ENABLING, SCX_OPS_ENABLED, SCX_OPS_DISABLING, @@ -786,7 +796,6 @@ enum scx_ops_enable_state { }; static const char *scx_ops_enable_state_str[] = { - [SCX_OPS_PREPPING] = "prepping", [SCX_OPS_ENABLING] = "enabling", [SCX_OPS_ENABLED] = "enabled", [SCX_OPS_DISABLING] = "disabling", @@ -853,7 +862,9 @@ static DEFINE_MUTEX(scx_ops_enable_mutex); DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); -static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0); +static int scx_ops_bypass_depth; +static DEFINE_RAW_SPINLOCK(__scx_ops_bypass_lock); +static bool scx_ops_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); @@ -925,8 
+936,15 @@ static unsigned long __percpu *scx_kick_cpus_pnt_seqs; */ static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); -/* dispatch queues */ -static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global; +/* + * Dispatch queues. + * + * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. This is + * to avoid live-locking in bypass mode where all tasks are dispatched to + * %SCX_DSQ_GLOBAL and all CPUs consume from it. If per-node split isn't + * sufficient, it can be further split. + */ +static struct scx_dispatch_q **global_dsqs; static const struct rhashtable_params dsq_hash_params = { .key_len = 8, @@ -1029,6 +1047,16 @@ static bool u32_before(u32 a, u32 b) return (s32)(a - b) < 0; } +static struct scx_dispatch_q *find_global_dsq(struct task_struct *p) +{ + return global_dsqs[cpu_to_node(task_cpu(p))]; +} + +static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) +{ + return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); +} + /* * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate @@ -1252,86 +1280,104 @@ struct scx_task_iter { struct task_struct *locked; struct rq *rq; struct rq_flags rf; + u32 cnt; }; /** - * scx_task_iter_init - Initialize a task iterator + * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration * @iter: iterator to init * - * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized, - * @iter must eventually be exited with scx_task_iter_exit(). + * Initialize @iter and return with scx_tasks_lock held. Once initialized, @iter + * must eventually be stopped with scx_task_iter_stop(). * - * scx_tasks_lock may be released between this and the first next() call or - * between any two next() calls. 
If scx_tasks_lock is released between two - * next() calls, the caller is responsible for ensuring that the task being - * iterated remains accessible either through RCU read lock or obtaining a - * reference count. + * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock() + * between this and the first next() call or between any two next() calls. If + * the locks are released between two next() calls, the caller is responsible + * for ensuring that the task being iterated remains accessible either through + * RCU read lock or obtaining a reference count. * * All tasks which existed when the iteration started are guaranteed to be * visited as long as they still exist. */ -static void scx_task_iter_init(struct scx_task_iter *iter) +static void scx_task_iter_start(struct scx_task_iter *iter) { - lockdep_assert_held(&scx_tasks_lock); - BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); + spin_lock_irq(&scx_tasks_lock); + iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; list_add(&iter->cursor.tasks_node, &scx_tasks); iter->locked = NULL; + iter->cnt = 0; +} + +static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) +{ + if (iter->locked) { + task_rq_unlock(iter->rq, iter->locked, &iter->rf); + iter->locked = NULL; + } } /** - * scx_task_iter_rq_unlock - Unlock rq locked by a task iterator - * @iter: iterator to unlock rq for + * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator + * @iter: iterator to unlock * * If @iter is in the middle of a locked iteration, it may be locking the rq of - * the task currently being visited. Unlock the rq if so. This function can be - * safely called anytime during an iteration. + * the task currently being visited in addition to scx_tasks_lock. Unlock both. + * This function can be safely called anytime during an iteration. 
+ */ +static void scx_task_iter_unlock(struct scx_task_iter *iter) +{ + __scx_task_iter_rq_unlock(iter); + spin_unlock_irq(&scx_tasks_lock); +} + +/** + * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock() + * @iter: iterator to re-lock * - * Returns %true if the rq @iter was locking is unlocked. %false if @iter was - * not locking an rq. + * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it + * doesn't re-lock the rq lock. Must be called before other iterator operations. */ -static bool scx_task_iter_rq_unlock(struct scx_task_iter *iter) +static void scx_task_iter_relock(struct scx_task_iter *iter) { - if (iter->locked) { - task_rq_unlock(iter->rq, iter->locked, &iter->rf); - iter->locked = NULL; - return true; - } else { - return false; - } + spin_lock_irq(&scx_tasks_lock); } /** - * scx_task_iter_exit - Exit a task iterator + * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock * @iter: iterator to exit * - * Exit a previously initialized @iter. Must be called with scx_tasks_lock held. - * If the iterator holds a task's rq lock, that rq lock is released. See - * scx_task_iter_init() for details. + * Exit a previously initialized @iter. Must be called with scx_tasks_lock held + * which is released on return. If the iterator holds a task's rq lock, that rq + * lock is also released. See scx_task_iter_start() for details. */ -static void scx_task_iter_exit(struct scx_task_iter *iter) +static void scx_task_iter_stop(struct scx_task_iter *iter) { - lockdep_assert_held(&scx_tasks_lock); - - scx_task_iter_rq_unlock(iter); list_del_init(&iter->cursor.tasks_node); + scx_task_iter_unlock(iter); } /** * scx_task_iter_next - Next task * @iter: iterator to walk * - * Visit the next task. See scx_task_iter_init() for details. + * Visit the next task. See scx_task_iter_start() for details. 
Locks are dropped + * and re-acquired every %SCX_OPS_TASK_ITER_BATCH iterations to avoid causing + * stalls by holding scx_tasks_lock for too long. */ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) { struct list_head *cursor = &iter->cursor.tasks_node; struct sched_ext_entity *pos; - lockdep_assert_held(&scx_tasks_lock); + if (!(++iter->cnt % SCX_OPS_TASK_ITER_BATCH)) { + scx_task_iter_unlock(iter); + cond_resched(); + scx_task_iter_relock(iter); + } list_for_each_entry(pos, cursor, tasks_node) { if (&pos->tasks_node == &scx_tasks) @@ -1352,14 +1398,14 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) * @include_dead: Whether we should include dead tasks in the iteration * * Visit the non-idle task with its rq lock held. Allows callers to specify - * whether they would like to filter out dead tasks. See scx_task_iter_init() + * whether they would like to filter out dead tasks. See scx_task_iter_start() * for details. */ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) { struct task_struct *p; - scx_task_iter_rq_unlock(iter); + __scx_task_iter_rq_unlock(iter); while ((p = scx_task_iter_next(iter))) { /* @@ -1637,7 +1683,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, scx_ops_error("attempting to dispatch to a destroyed dsq"); /* fall back to the global dsq */ raw_spin_unlock(&dsq->lock); - dsq = &scx_dsq_global; + dsq = find_global_dsq(p); raw_spin_lock(&dsq->lock); } } @@ -1803,21 +1849,6 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p) raw_spin_unlock(&dsq->lock); } -static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) -{ - return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); -} - -static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id) -{ - lockdep_assert(rcu_read_lock_any_held()); - - if (dsq_id == SCX_DSQ_GLOBAL) - return &scx_dsq_global; - else - return find_user_dsq(dsq_id); -} - static struct 
scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, struct task_struct *p) { @@ -1830,16 +1861,20 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) - return &scx_dsq_global; + return find_global_dsq(p); return &cpu_rq(cpu)->scx.local_dsq; } - dsq = find_non_local_dsq(dsq_id); + if (dsq_id == SCX_DSQ_GLOBAL) + dsq = find_global_dsq(p); + else + dsq = find_user_dsq(dsq_id); + if (unlikely(!dsq)) { scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", dsq_id, p->comm, p->pid); - return &scx_dsq_global; + return find_global_dsq(p); } return dsq; @@ -2011,7 +2046,7 @@ local_norefill: global: touch_core_sched(rq, p); /* see the comment in local: */ p->scx.slice = SCX_SLICE_DFL; - dispatch_enqueue(&scx_dsq_global, p, enq_flags); + dispatch_enqueue(find_global_dsq(p), p, enq_flags); } static bool task_runnable(const struct task_struct *p) @@ -2357,6 +2392,7 @@ static bool consume_remote_task(struct rq *this_rq, struct task_struct *p, } } #else /* CONFIG_SMP */ +static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); } static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { return false; } static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; } #endif /* CONFIG_SMP */ @@ -2396,6 +2432,13 @@ retry: return false; } +static bool consume_global_dsq(struct rq *rq) +{ + int node = cpu_to_node(cpu_of(rq)); + + return consume_dispatch_q(rq, global_dsqs[node]); +} + /** * dispatch_to_local_dsq - Dispatch a task to a local dsq * @rq: current rq which is locked @@ -2429,7 +2472,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, #ifdef CONFIG_SMP if 
(unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { - dispatch_enqueue(&scx_dsq_global, p, enq_flags | SCX_ENQ_CLEAR_OPSS); + dispatch_enqueue(find_global_dsq(p), p, + enq_flags | SCX_ENQ_CLEAR_OPSS); return; } @@ -2590,7 +2634,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) lockdep_assert_rq_held(rq); rq->scx.flags |= SCX_RQ_IN_BALANCE; - rq->scx.flags &= ~SCX_RQ_BAL_KEEP; + rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP); if (static_branch_unlikely(&scx_ops_cpu_preempt) && unlikely(rq->scx.cpu_released)) { @@ -2601,7 +2645,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) * emitted in scx_next_task_picked(). */ if (SCX_HAS_OP(cpu_acquire)) - SCX_CALL_OP(0, cpu_acquire, cpu_of(rq), NULL); + SCX_CALL_OP(SCX_KF_REST, cpu_acquire, cpu_of(rq), NULL); rq->scx.cpu_released = false; } @@ -2629,7 +2673,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) if (rq->scx.local_dsq.nr) goto has_tasks; - if (consume_dispatch_q(rq, &scx_dsq_global)) + if (consume_global_dsq(rq)) goto has_tasks; if (!SCX_HAS_OP(dispatch) || scx_rq_bypassing(rq) || !scx_rq_online(rq)) @@ -2654,7 +2698,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) if (rq->scx.local_dsq.nr) goto has_tasks; - if (consume_dispatch_q(rq, &scx_dsq_global)) + if (consume_global_dsq(rq)) goto has_tasks; /* @@ -2904,12 +2948,11 @@ static struct task_struct *pick_task_scx(struct rq *rq) { struct task_struct *prev = rq->curr; struct task_struct *p; + bool prev_on_scx = prev->sched_class == &ext_sched_class; + bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; + bool kick_idle = false; /* - * If balance_scx() is telling us to keep running @prev, replenish slice - * if necessary and keep running @prev. Otherwise, pop the first one - * from the local DSQ. 
- * * WORKAROUND: * * %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just @@ -2918,27 +2961,46 @@ static struct task_struct *pick_task_scx(struct rq *rq) * which then ends up calling pick_task_scx() without preceding * balance_scx(). * - * For now, ignore cases where $prev is not on SCX. This isn't great and - * can theoretically lead to stalls. However, for switch_all cases, this - * happens only while a BPF scheduler is being loaded or unloaded, and, - * for partial cases, fair will likely keep triggering this CPU. + * Keep running @prev if possible and avoid stalling from entering idle + * without balancing. * - * Once fair is fixed, restore WARN_ON_ONCE(). + * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE() + * if pick_task_scx() is called without preceding balance_scx(). */ - if ((rq->scx.flags & SCX_RQ_BAL_KEEP) && - prev->sched_class == &ext_sched_class) { + if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) { + if (prev_on_scx) { + keep_prev = true; + } else { + keep_prev = false; + kick_idle = true; + } + } else if (unlikely(keep_prev && !prev_on_scx)) { + /* only allowed during transitions */ + WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED); + keep_prev = false; + } + + /* + * If balance_scx() is telling us to keep running @prev, replenish slice + * if necessary and keep running @prev. Otherwise, pop the first one + * from the local DSQ. 
+ */ + if (keep_prev) { p = prev; if (!p->scx.slice) p->scx.slice = SCX_SLICE_DFL; } else { p = first_local_task(rq); - if (!p) + if (!p) { + if (kick_idle) + scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE); return NULL; + } if (unlikely(!p->scx.slice)) { if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) { - printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n", - p->comm, p->pid); + printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", + p->comm, p->pid, __func__); scx_warned_zero_slice = true; } p->scx.slice = SCX_SLICE_DFL; @@ -3043,11 +3105,6 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, *found = false; - if (!static_branch_likely(&scx_builtin_idle_enabled)) { - scx_ops_error("built-in idle tracking is disabled"); - return prev_cpu; - } - /* * If WAKE_SYNC, the waker's local DSQ is empty, and the system is * under utilized, wake up @p to the local DSQ of the waker. Checking @@ -3058,22 +3115,13 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, * there is an idle core elsewhere on the system. */ cpu = smp_processor_id(); - if ((wake_flags & SCX_WAKE_SYNC) && p->nr_cpus_allowed > 1 && + if ((wake_flags & SCX_WAKE_SYNC) && !cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING) && cpu_rq(cpu)->scx.local_dsq.nr == 0) { if (cpumask_test_cpu(cpu, p->cpus_ptr)) goto cpu_found; } - if (p->nr_cpus_allowed == 1) { - if (test_and_clear_cpu_idle(prev_cpu)) { - cpu = prev_cpu; - goto cpu_found; - } else { - return prev_cpu; - } - } - /* * If CPU has SMT, any wholly idle CPU is likely a better pick than * partially idle @prev_cpu. 
@@ -3121,7 +3169,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag if (unlikely(wake_flags & WF_EXEC)) return prev_cpu; - if (SCX_HAS_OP(select_cpu)) { + if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) { s32 cpu; struct task_struct **ddsp_taskp; @@ -3186,7 +3234,7 @@ void __scx_update_idle(struct rq *rq, bool idle) { int cpu = cpu_of(rq); - if (SCX_HAS_OP(update_idle)) { + if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) { SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); if (!static_branch_unlikely(&scx_builtin_idle_enabled)) return; @@ -3519,12 +3567,7 @@ static void scx_ops_exit_task(struct task_struct *p) void init_scx_entity(struct sched_ext_entity *scx) { - /* - * init_idle() calls this function again after fork sequence is - * complete. Don't touch ->tasks_node as it's already linked. - */ - memset(scx, 0, offsetof(struct sched_ext_entity, tasks_node)); - + memset(scx, 0, sizeof(*scx)); INIT_LIST_HEAD(&scx->dsq_list.node); RB_CLEAR_NODE(&scx->dsq_priq); scx->sticky_cpu = -1; @@ -3550,7 +3593,7 @@ int scx_fork(struct task_struct *p) { percpu_rwsem_assert_held(&scx_fork_rwsem); - if (scx_enabled()) + if (scx_ops_init_task_enabled) return scx_ops_init_task(p, task_group(p), true); else return 0; @@ -3558,7 +3601,7 @@ int scx_fork(struct task_struct *p) void scx_post_fork(struct task_struct *p) { - if (scx_enabled()) { + if (scx_ops_init_task_enabled) { scx_set_task_state(p, SCX_TASK_READY); /* @@ -3690,6 +3733,7 @@ bool scx_can_stop_tick(struct rq *rq) #ifdef CONFIG_EXT_GROUP_SCHED DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); +static bool scx_cgroup_enabled; static bool cgroup_warned_missing_weight; static bool cgroup_warned_missing_idle; @@ -3709,8 +3753,7 @@ static void scx_cgroup_warn_missing_weight(struct task_group *tg) static void scx_cgroup_warn_missing_idle(struct task_group *tg) { - if (scx_ops_enable_state() == SCX_OPS_DISABLED || - cgroup_warned_missing_idle) + if (!scx_cgroup_enabled || 
cgroup_warned_missing_idle) return; if (!tg->idle) @@ -3731,15 +3774,18 @@ int scx_tg_online(struct task_group *tg) scx_cgroup_warn_missing_weight(tg); - if (SCX_HAS_OP(cgroup_init)) { - struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; + if (scx_cgroup_enabled) { + if (SCX_HAS_OP(cgroup_init)) { + struct scx_cgroup_init_args args = + { .weight = tg->scx_weight }; - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, - tg->css.cgroup, &args); - if (!ret) + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, + tg->css.cgroup, &args); + if (ret) + ret = ops_sanitize_err("cgroup_init", ret); + } + if (ret == 0) tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; - else - ret = ops_sanitize_err("cgroup_init", ret); } else { tg->scx_flags |= SCX_TG_ONLINE; } @@ -3770,7 +3816,7 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset) /* released in scx_finish/cancel_attach() */ percpu_down_read(&scx_cgroup_rwsem); - if (!scx_enabled()) + if (!scx_cgroup_enabled) return 0; cgroup_taskset_for_each(p, css, tset) { @@ -3813,7 +3859,7 @@ err: void scx_move_task(struct task_struct *p) { - if (!scx_enabled()) + if (!scx_cgroup_enabled) return; /* @@ -3849,7 +3895,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) struct cgroup_subsys_state *css; struct task_struct *p; - if (!scx_enabled()) + if (!scx_cgroup_enabled) goto out_unlock; cgroup_taskset_for_each(p, css, tset) { @@ -3866,7 +3912,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) { percpu_down_read(&scx_cgroup_rwsem); - if (tg->scx_weight != weight) { + if (scx_cgroup_enabled && tg->scx_weight != weight) { if (SCX_HAS_OP(cgroup_set_weight)) SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, tg_cgrp(tg), weight); @@ -4038,6 +4084,8 @@ static void scx_cgroup_exit(void) percpu_rwsem_assert_held(&scx_cgroup_rwsem); + scx_cgroup_enabled = false; + /* * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. 
If we walk * cgroups and exit all the inited ones, all online cgroups are exited. @@ -4104,6 +4152,7 @@ static int scx_cgroup_init(void) css->cgroup, &args); if (ret) { css_put(css); + scx_ops_error("ops.cgroup_init() failed (%d)", ret); return ret; } tg->scx_flags |= SCX_TG_INITED; @@ -4113,6 +4162,9 @@ static int scx_cgroup_init(void) } rcu_read_unlock(); + WARN_ON_ONCE(scx_cgroup_enabled); + scx_cgroup_enabled = true; + return 0; } @@ -4218,14 +4270,14 @@ static const struct kset_uevent_ops scx_uevent_ops = { * Used by sched_fork() and __setscheduler_prio() to pick the matching * sched_class. dl/rt are already handled. */ -bool task_should_scx(struct task_struct *p) +bool task_should_scx(int policy) { if (!scx_enabled() || unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING)) return false; if (READ_ONCE(scx_switching_all)) return true; - return p->policy == SCHED_EXT; + return policy == SCHED_EXT; } /** @@ -4240,36 +4292,40 @@ bool task_should_scx(struct task_struct *p) * the DISABLING state and then cycling the queued tasks through dequeue/enqueue * to force global FIFO scheduling. * - * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order. - * %SCX_OPS_ENQ_LAST is also ignored. + * - ops.select_cpu() is ignored and the default select_cpu() is used. * - * b. ops.dispatch() is ignored. + * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order. + * %SCX_OPS_ENQ_LAST is also ignored. * - * c. balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice - * can't be trusted. Whenever a tick triggers, the running task is rotated to - * the tail of the queue with core_sched_at touched. + * - ops.dispatch() is ignored. * - * d. pick_next_task() suppresses zero slice warning. + * - balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice + * can't be trusted. Whenever a tick triggers, the running task is rotated to + * the tail of the queue with core_sched_at touched. * - * e. 
scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM - * operations. + * - pick_next_task() suppresses zero slice warning. * - * f. scx_prio_less() reverts to the default core_sched_at order. + * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM + * operations. + * + * - scx_prio_less() reverts to the default core_sched_at order. */ static void scx_ops_bypass(bool bypass) { - int depth, cpu; + int cpu; + unsigned long flags; + raw_spin_lock_irqsave(&__scx_ops_bypass_lock, flags); if (bypass) { - depth = atomic_inc_return(&scx_ops_bypass_depth); - WARN_ON_ONCE(depth <= 0); - if (depth != 1) - return; + scx_ops_bypass_depth++; + WARN_ON_ONCE(scx_ops_bypass_depth <= 0); + if (scx_ops_bypass_depth != 1) + goto unlock; } else { - depth = atomic_dec_return(&scx_ops_bypass_depth); - WARN_ON_ONCE(depth < 0); - if (depth != 0) - return; + scx_ops_bypass_depth--; + WARN_ON_ONCE(scx_ops_bypass_depth < 0); + if (scx_ops_bypass_depth != 0) + goto unlock; } /* @@ -4286,7 +4342,7 @@ static void scx_ops_bypass(bool bypass) struct rq_flags rf; struct task_struct *p, *n; - rq_lock_irqsave(rq, &rf); + rq_lock(rq, &rf); if (bypass) { WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING); @@ -4322,11 +4378,13 @@ static void scx_ops_bypass(bool bypass) sched_enq_and_set_task(&ctx); } - rq_unlock_irqrestore(rq, &rf); + rq_unlock(rq, &rf); - /* kick to restore ticks */ + /* resched to restore ticks and idle state */ resched_cpu(cpu); } +unlock: + raw_spin_unlock_irqrestore(&__scx_ops_bypass_lock, flags); } static void free_exit_info(struct scx_exit_info *ei) @@ -4431,27 +4489,34 @@ static void scx_ops_disable_workfn(struct kthread_work *work) WRITE_ONCE(scx_switching_all, false); /* - * Avoid racing against fork and cgroup changes. See scx_ops_enable() - * for explanation on the locking order. + * Shut down cgroup support before tasks so that the cgroup attach path + * doesn't race against scx_ops_exit_task(). 
*/ - percpu_down_write(&scx_fork_rwsem); - cpus_read_lock(); scx_cgroup_lock(); + scx_cgroup_exit(); + scx_cgroup_unlock(); - spin_lock_irq(&scx_tasks_lock); - scx_task_iter_init(&sti); /* * The BPF scheduler is going away. All tasks including %TASK_DEAD ones * must be switched out and exited synchronously. */ + percpu_down_write(&scx_fork_rwsem); + + scx_ops_init_task_enabled = false; + + scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { const struct sched_class *old_class = p->sched_class; + const struct sched_class *new_class = + __setscheduler_class(p->policy, p->prio); struct sched_enq_and_set_ctx ctx; + if (old_class != new_class && p->se.sched_delayed) + dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL); - __setscheduler_prio(p, p->prio); + p->sched_class = new_class; check_class_changing(task_rq(p), p, old_class); sched_enq_and_set_task(&ctx); @@ -4459,25 +4524,19 @@ static void scx_ops_disable_workfn(struct kthread_work *work) check_class_changed(task_rq(p), p, old_class, p->prio); scx_ops_exit_task(p); } - scx_task_iter_exit(&sti); - spin_unlock_irq(&scx_tasks_lock); + scx_task_iter_stop(&sti); + percpu_up_write(&scx_fork_rwsem); /* no task is on scx, turn off all the switches and flush in-progress calls */ - static_branch_disable_cpuslocked(&__scx_ops_enabled); + static_branch_disable(&__scx_ops_enabled); for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) - static_branch_disable_cpuslocked(&scx_has_op[i]); - static_branch_disable_cpuslocked(&scx_ops_enq_last); - static_branch_disable_cpuslocked(&scx_ops_enq_exiting); - static_branch_disable_cpuslocked(&scx_ops_cpu_preempt); - static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); + static_branch_disable(&scx_has_op[i]); + static_branch_disable(&scx_ops_enq_last); + static_branch_disable(&scx_ops_enq_exiting); + 
static_branch_disable(&scx_ops_cpu_preempt); + static_branch_disable(&scx_builtin_idle_enabled); synchronize_rcu(); - scx_cgroup_exit(); - - scx_cgroup_unlock(); - cpus_read_unlock(); - percpu_up_write(&scx_fork_rwsem); - if (ei->kind >= SCX_EXIT_ERROR) { pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", scx_ops.name, ei->reason); @@ -4929,11 +4988,11 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) struct scx_task_iter sti; struct task_struct *p; unsigned long timeout; - int i, cpu, ret; + int i, cpu, node, ret; if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), cpu_possible_mask)) { - pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation"); + pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); return -EINVAL; } @@ -4948,6 +5007,34 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) } } + if (!global_dsqs) { + struct scx_dispatch_q **dsqs; + + dsqs = kcalloc(nr_node_ids, sizeof(dsqs[0]), GFP_KERNEL); + if (!dsqs) { + ret = -ENOMEM; + goto err_unlock; + } + + for_each_node_state(node, N_POSSIBLE) { + struct scx_dispatch_q *dsq; + + dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node); + if (!dsq) { + for_each_node_state(node, N_POSSIBLE) + kfree(dsqs[node]); + kfree(dsqs); + ret = -ENOMEM; + goto err_unlock; + } + + init_dsq(dsq, SCX_DSQ_GLOBAL); + dsqs[node] = dsq; + } + + global_dsqs = dsqs; + } + if (scx_ops_enable_state() != SCX_OPS_DISABLED) { ret = -EBUSY; goto err_unlock; @@ -4971,12 +5058,12 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) } /* - * Set scx_ops, transition to PREPPING and clear exit info to arm the + * Set scx_ops, transition to ENABLING and clear exit info to arm the * disable path. Failure triggers full disabling from here on. 
*/ scx_ops = *ops; - WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_PREPPING) != + WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_ENABLING) != SCX_OPS_DISABLED); atomic_set(&scx_exit_kind, SCX_EXIT_NONE); @@ -4997,7 +5084,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init); if (ret) { ret = ops_sanitize_err("init", ret); - goto err_disable_unlock_cpus; + cpus_read_unlock(); + scx_ops_error("ops.init() failed (%d)", ret); + goto err_disable; } } @@ -5005,6 +5094,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (((void (**)(void))ops)[i]) static_branch_enable_cpuslocked(&scx_has_op[i]); + check_hotplug_seq(ops); cpus_read_unlock(); ret = validate_ops(ops); @@ -5032,57 +5122,40 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_watchdog_timeout / 2); /* - * Lock out forks, cgroup on/offlining and moves before opening the - * floodgate so that they don't wander into the operations prematurely. - * - * We don't need to keep the CPUs stable but static_branch_*() requires - * cpus_read_lock() and scx_cgroup_rwsem must nest inside - * cpu_hotplug_lock because of the following dependency chain: - * - * cpu_hotplug_lock --> cgroup_threadgroup_rwsem --> scx_cgroup_rwsem - * - * So, we need to do cpus_read_lock() before scx_cgroup_lock() and use - * static_branch_*_cpuslocked(). - * - * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the - * following dependency chain: - * - * scx_fork_rwsem --> pernet_ops_rwsem --> cpu_hotplug_lock + * Once __scx_ops_enabled is set, %current can be switched to SCX + * anytime. This can lead to stalls as some BPF schedulers (e.g. + * userspace scheduling) may not function correctly before all tasks are + * switched. Init in bypass mode to guarantee forward progress. 
*/ - percpu_down_write(&scx_fork_rwsem); - cpus_read_lock(); - scx_cgroup_lock(); - - check_hotplug_seq(ops); + scx_ops_bypass(true); for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) if (((void (**)(void))ops)[i]) - static_branch_enable_cpuslocked(&scx_has_op[i]); + static_branch_enable(&scx_has_op[i]); if (ops->flags & SCX_OPS_ENQ_LAST) - static_branch_enable_cpuslocked(&scx_ops_enq_last); + static_branch_enable(&scx_ops_enq_last); if (ops->flags & SCX_OPS_ENQ_EXITING) - static_branch_enable_cpuslocked(&scx_ops_enq_exiting); + static_branch_enable(&scx_ops_enq_exiting); if (scx_ops.cpu_acquire || scx_ops.cpu_release) - static_branch_enable_cpuslocked(&scx_ops_cpu_preempt); + static_branch_enable(&scx_ops_cpu_preempt); if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { reset_idle_masks(); - static_branch_enable_cpuslocked(&scx_builtin_idle_enabled); + static_branch_enable(&scx_builtin_idle_enabled); } else { - static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); + static_branch_disable(&scx_builtin_idle_enabled); } /* - * All cgroups should be initialized before letting in tasks. cgroup - * on/offlining and task migrations are already locked out. + * Lock out forks, cgroup on/offlining and moves before opening the + * floodgate so that they don't wander into the operations prematurely. */ - ret = scx_cgroup_init(); - if (ret) - goto err_disable_unlock_all; + percpu_down_write(&scx_fork_rwsem); - static_branch_enable_cpuslocked(&__scx_ops_enabled); + WARN_ON_ONCE(scx_ops_init_task_enabled); + scx_ops_init_task_enabled = true; /* * Enable ops for every task. Fork is excluded by scx_fork_rwsem @@ -5090,10 +5163,19 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) * leaving as sched_ext_free() can handle both prepped and enabled * tasks. Prep all tasks first and then enable them with preemption * disabled. 
+ * + * All cgroups should be initialized before scx_ops_init_task() so that + * the BPF scheduler can reliably track each task's cgroup membership + * from scx_ops_init_task(). Lock out cgroup on/offlining and task + * migrations while tasks are being initialized so that + * scx_cgroup_can_attach() never sees uninitialized tasks. */ - spin_lock_irq(&scx_tasks_lock); + scx_cgroup_lock(); + ret = scx_cgroup_init(); + if (ret) + goto err_disable_unlock_all; - scx_task_iter_init(&sti); + scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { /* * @p may already be dead, have lost all its usages counts and @@ -5103,84 +5185,67 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (!tryget_task_struct(p)) continue; - scx_task_iter_rq_unlock(&sti); - spin_unlock_irq(&scx_tasks_lock); + scx_task_iter_unlock(&sti); ret = scx_ops_init_task(p, task_group(p), false); if (ret) { put_task_struct(p); - spin_lock_irq(&scx_tasks_lock); - scx_task_iter_exit(&sti); - spin_unlock_irq(&scx_tasks_lock); - pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n", - ret, p->comm, p->pid); + scx_task_iter_relock(&sti); + scx_task_iter_stop(&sti); + scx_ops_error("ops.init_task() failed (%d) for %s[%d]", + ret, p->comm, p->pid); goto err_disable_unlock_all; } + scx_set_task_state(p, SCX_TASK_READY); + put_task_struct(p); - spin_lock_irq(&scx_tasks_lock); + scx_task_iter_relock(&sti); } - scx_task_iter_exit(&sti); + scx_task_iter_stop(&sti); + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); /* - * All tasks are prepped but are still ops-disabled. Ensure that - * %current can't be scheduled out and switch everyone. - * preempt_disable() is necessary because we can't guarantee that - * %current won't be starved if scheduled out while switching. + * All tasks are READY. It's safe to turn on scx_enabled() and switch + * all eligible tasks. 
*/ - preempt_disable(); - - /* - * From here on, the disable path must assume that tasks have ops - * enabled and need to be recovered. - * - * Transition to ENABLING fails iff the BPF scheduler has already - * triggered scx_bpf_error(). Returning an error code here would lose - * the recorded error information. Exit indicating success so that the - * error is notified through ops.exit() with all the details. - */ - if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLING, SCX_OPS_PREPPING)) { - preempt_enable(); - spin_unlock_irq(&scx_tasks_lock); - WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); - ret = 0; - goto err_disable_unlock_all; - } + WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); + static_branch_enable(&__scx_ops_enabled); /* - * We're fully committed and can't fail. The PREPPED -> ENABLED + * We're fully committed and can't fail. The task READY -> ENABLED * transitions here are synchronized against sched_ext_free() through * scx_tasks_lock. */ - WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); - - scx_task_iter_init(&sti); + percpu_down_write(&scx_fork_rwsem); + scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { const struct sched_class *old_class = p->sched_class; + const struct sched_class *new_class = + __setscheduler_class(p->policy, p->prio); struct sched_enq_and_set_ctx ctx; + if (old_class != new_class && p->se.sched_delayed) + dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - scx_set_task_state(p, SCX_TASK_READY); - __setscheduler_prio(p, p->prio); + p->scx.slice = SCX_SLICE_DFL; + p->sched_class = new_class; check_class_changing(task_rq(p), p, old_class); sched_enq_and_set_task(&ctx); check_class_changed(task_rq(p), p, old_class, p->prio); } - scx_task_iter_exit(&sti); - - spin_unlock_irq(&scx_tasks_lock); - preempt_enable(); - scx_cgroup_unlock(); - cpus_read_unlock(); + 
scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); - /* see above ENABLING transition for the explanation on exiting with 0 */ + scx_ops_bypass(false); + if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); - ret = 0; goto err_disable; } @@ -5212,14 +5277,21 @@ err_unlock: err_disable_unlock_all: scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); -err_disable_unlock_cpus: - cpus_read_unlock(); + scx_ops_bypass(false); err_disable: mutex_unlock(&scx_ops_enable_mutex); - /* must be fully disabled before returning */ - scx_ops_disable(SCX_EXIT_ERROR); + /* + * Returning an error code here would not pass all the error information + * to userspace. Record errno using scx_ops_error() for cases + * scx_ops_error() wasn't already invoked and exit indicating success so + * that the error is notified through ops.exit() with all the details. + * + * Flush scx_ops_disable_work to ensure that error is reported before + * init completion. 
+ */ + scx_ops_error("scx_ops_enable() failed (%d)", ret); kthread_flush_work(&scx_ops_disable_work); - return ret; + return 0; } @@ -5782,7 +5854,6 @@ void __init init_sched_ext_class(void) SCX_TG_ONLINE); BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); - init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL); #ifdef CONFIG_SMP BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); @@ -5840,16 +5911,21 @@ __bpf_kfunc_start_defs(); __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) { - if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) { - *is_idle = false; - return prev_cpu; + if (!static_branch_likely(&scx_builtin_idle_enabled)) { + scx_ops_error("built-in idle tracking is disabled"); + goto prev_cpu; } + + if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) + goto prev_cpu; + #ifdef CONFIG_SMP return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle); -#else +#endif + +prev_cpu: *is_idle = false; return prev_cpu; -#endif } __bpf_kfunc_end_defs(); @@ -6058,7 +6134,7 @@ static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit, if (dst_dsq->id == SCX_DSQ_LOCAL) { dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); if (!task_can_run_on_remote_rq(p, dst_rq, true)) { - dst_dsq = &scx_dsq_global; + dst_dsq = find_global_dsq(p); dst_rq = src_rq; } } else { @@ -6175,7 +6251,7 @@ __bpf_kfunc bool scx_bpf_consume(u64 dsq_id) flush_dispatch_buf(dspc->rq); - dsq = find_non_local_dsq(dsq_id); + dsq = find_user_dsq(dsq_id); if (unlikely(!dsq)) { scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id); return false; @@ -6496,7 +6572,7 @@ __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) goto out; } } else { - dsq = find_non_local_dsq(dsq_id); + dsq = find_user_dsq(dsq_id); if (dsq) { ret = READ_ONCE(dsq->nr); goto out; @@ -6545,7 +6621,7 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, if (flags & ~__SCX_DSQ_ITER_USER_FLAGS) return -EINVAL; - 
kit->dsq = find_non_local_dsq(dsq_id); + kit->dsq = find_user_dsq(dsq_id); if (!kit->dsq) return -ENOENT; diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 246019519231..b1675bb59fc4 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -18,7 +18,7 @@ bool scx_can_stop_tick(struct rq *rq); void scx_rq_activate(struct rq *rq); void scx_rq_deactivate(struct rq *rq); int scx_check_setscheduler(struct task_struct *p, int policy); -bool task_should_scx(struct task_struct *p); +bool task_should_scx(int policy); void init_sched_ext_class(void); static inline u32 scx_cpuperf_target(s32 cpu) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 225b31aaee55..fbdca89c677f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1200,12 +1200,12 @@ static inline bool do_preempt_short(struct cfs_rq *cfs_rq, */ s64 update_curr_common(struct rq *rq) { - struct task_struct *curr = rq->curr; + struct task_struct *donor = rq->donor; s64 delta_exec; - delta_exec = update_curr_se(rq, &curr->se); + delta_exec = update_curr_se(rq, &donor->se); if (likely(delta_exec > 0)) - update_curr_task(curr, delta_exec); + update_curr_task(donor, delta_exec); return delta_exec; } @@ -1247,18 +1247,18 @@ static void update_curr(struct cfs_rq *cfs_rq) account_cfs_rq_runtime(cfs_rq, delta_exec); - if (rq->nr_running == 1) + if (cfs_rq->nr_running == 1) return; if (resched || did_preempt_short(cfs_rq, curr)) { - resched_curr(rq); + resched_curr_lazy(rq); clear_buddies(cfs_rq, curr); } } static void update_curr_fair(struct rq *rq) { - update_curr(cfs_rq_of(&rq->curr->se)); + update_curr(cfs_rq_of(&rq->donor->se)); } static inline void @@ -3369,7 +3369,7 @@ retry_pids: vma = vma_next(&vmi); } - do { + for (; vma; vma = vma_next(&vmi)) { if (!vma_migratable(vma) || !vma_policy_mof(vma) || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE); @@ -3491,7 +3491,7 @@ retry_pids: */ if (vma_pids_forced) break; - } 
for_each_vma(vmi, vma); + } /* * If no VMAs are remaining and VMAs were skipped due to the PID @@ -5280,7 +5280,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * * EEVDF: placement strategy #1 / #2 */ - if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) { + if (sched_feat(PLACE_LAG) && cfs_rq->nr_running && se->vlag) { struct sched_entity *curr = cfs_rq->curr; unsigned long load; @@ -5625,8 +5625,9 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) struct sched_entity *se = pick_eevdf(cfs_rq); if (se->sched_delayed) { dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); - SCHED_WARN_ON(se->sched_delayed); - SCHED_WARN_ON(se->on_rq); + /* + * Must not reference @se again, see __block_task(). + */ return NULL; } return se; @@ -5677,15 +5678,9 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * validating it and just reschedule. */ if (queued) { - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); return; } - /* - * don't let the period tick interfere with the hrtick preemption - */ - if (!sched_feat(DOUBLE_TICK) && - hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) - return; #endif } @@ -6058,10 +6053,13 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); - if (se->on_rq) { - SCHED_WARN_ON(se->sched_delayed); + /* Handle any unfinished DELAY_DEQUEUE business first. 
*/ + if (se->sched_delayed) { + int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED; + + dequeue_entity(qcfs_rq, se, flags); + } else if (se->on_rq) break; - } enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); if (cfs_rq_is_idle(group_cfs_rq(se))) @@ -6818,7 +6816,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) s64 delta = slice - ran; if (delta < 0) { - if (task_current(rq, p)) + if (task_current_donor(rq, p)) resched_curr(rq); return; } @@ -6833,12 +6831,12 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) */ static void hrtick_update(struct rq *rq) { - struct task_struct *curr = rq->curr; + struct task_struct *donor = rq->donor; - if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class) + if (!hrtick_enabled_fair(rq) || donor->sched_class != &fair_sched_class) return; - hrtick_start_fair(rq, curr); + hrtick_start_fair(rq, donor); } #else /* !CONFIG_SCHED_HRTICK */ static inline void @@ -7173,7 +7171,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) /* Fix-up what dequeue_task_fair() skipped */ hrtick_update(rq); - /* Fix-up what block_task() skipped. */ + /* + * Fix-up what block_task() skipped. + * + * Must be last, @p might not be valid after this. + */ __block_task(rq, p); } @@ -7190,12 +7192,14 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE)))) util_est_dequeue(&rq->cfs, p); - if (dequeue_entities(rq, &p->se, flags) < 0) { - util_est_update(&rq->cfs, p, DEQUEUE_SLEEP); + util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); + if (dequeue_entities(rq, &p->se, flags) < 0) return false; - } - util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); + /* + * Must not reference @p after dequeue_entities(DEQUEUE_DELAYED). 
+ */ + hrtick_update(rq); return true; } @@ -8753,9 +8757,9 @@ static void set_next_buddy(struct sched_entity *se) */ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) { - struct task_struct *curr = rq->curr; - struct sched_entity *se = &curr->se, *pse = &p->se; - struct cfs_rq *cfs_rq = task_cfs_rq(curr); + struct task_struct *donor = rq->donor; + struct sched_entity *se = &donor->se, *pse = &p->se; + struct cfs_rq *cfs_rq = task_cfs_rq(donor); int cse_is_idle, pse_is_idle; if (unlikely(se == pse)) @@ -8784,7 +8788,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * prevents us from potentially nominating it as a false LAST_BUDDY * below. */ - if (test_tsk_need_resched(curr)) + if (test_tsk_need_resched(rq->curr)) return; if (!sched_feat(WAKEUP_PREEMPTION)) @@ -8832,7 +8836,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int return; preempt: - resched_curr(rq); + resched_curr_lazy(rq); } static struct task_struct *pick_task_fair(struct rq *rq) @@ -13083,7 +13087,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (task_current(rq, p)) { + if (task_current_donor(rq, p)) { if (p->prio > oldprio) resched_curr(rq); } else @@ -13174,22 +13178,6 @@ static void attach_task_cfs_rq(struct task_struct *p) static void switched_from_fair(struct rq *rq, struct task_struct *p) { detach_task_cfs_rq(p); - /* - * Since this is called after changing class, this is a little weird - * and we cannot use DEQUEUE_DELAYED. 
- */ - if (p->se.sched_delayed) { - /* First, dequeue it from its new class' structures */ - dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP); - /* - * Now, clean up the fair_sched_class side of things - * related to sched_delayed being true and that wasn't done - * due to the generic dequeue not using DEQUEUE_DELAYED. - */ - finish_delayed_dequeue_entity(&p->se); - p->se.rel_deadline = 0; - __block_task(rq, p); - } } static void switched_to_fair(struct rq *rq, struct task_struct *p) @@ -13206,7 +13194,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) * kick off the schedule if running, otherwise just see * if we can still preempt the current task. */ - if (task_current(rq, p)) + if (task_current_donor(rq, p)) resched_curr(rq); else wakeup_preempt(rq, p, 0); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 290874079f60..a3d331dd2d8f 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -19,7 +19,7 @@ SCHED_FEAT(PLACE_REL_DEADLINE, true) */ SCHED_FEAT(RUN_TO_PARITY, true) /* - * Allow wakeup of tasks with a shorter slice to cancel RESPECT_SLICE for + * Allow wakeup of tasks with a shorter slice to cancel RUN_TO_PARITY for * current. 
*/ SCHED_FEAT(PREEMPT_SHORT, true) @@ -56,7 +56,6 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true) SCHED_FEAT(HRTICK, false) SCHED_FEAT(HRTICK_DL, false) -SCHED_FEAT(DOUBLE_TICK, false) /* * Decrement CPU capacity based on time not spent running tasks diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 631e42802925..621696269584 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -271,7 +271,6 @@ static void do_idle(void) tick_nohz_idle_enter(); while (!need_resched()) { - rmb(); /* * Interrupts shouldn't be re-enabled from that point on until diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index a9c65d97b3ca..fc07382361a8 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -476,7 +476,7 @@ int update_irq_load_avg(struct rq *rq, u64 running) bool update_other_load_avgs(struct rq *rq) { u64 now = rq_clock_pelt(rq); - const struct sched_class *curr_class = rq->curr->sched_class; + const struct sched_class *curr_class = rq->donor->sched_class; unsigned long hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); lockdep_assert_rq_held(rq); diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 020d58967d4e..84dad1511d1e 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -769,12 +769,13 @@ static void record_times(struct psi_group_cpu *groupc, u64 now) } static void psi_group_change(struct psi_group *group, int cpu, - unsigned int clear, unsigned int set, u64 now, + unsigned int clear, unsigned int set, bool wake_clock) { struct psi_group_cpu *groupc; unsigned int t, m; u32 state_mask; + u64 now; lockdep_assert_rq_held(cpu_rq(cpu)); groupc = per_cpu_ptr(group->pcpu, cpu); @@ -789,6 +790,7 @@ static void psi_group_change(struct psi_group *group, int cpu, * SOME and FULL time these may have resulted in. 
*/ write_seqcount_begin(&groupc->seq); + now = cpu_clock(cpu); /* * Start with TSK_ONCPU, which doesn't have a corresponding @@ -899,18 +901,15 @@ void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); struct psi_group *group; - u64 now; if (!task->pid) return; psi_flags_change(task, clear, set); - now = cpu_clock(cpu); - group = task_psi_group(task); do { - psi_group_change(group, cpu, clear, set, now, true); + psi_group_change(group, cpu, clear, set, true); } while ((group = group->parent)); } @@ -919,7 +918,6 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, { struct psi_group *group, *common = NULL; int cpu = task_cpu(prev); - u64 now = cpu_clock(cpu); if (next->pid) { psi_flags_change(next, 0, TSK_ONCPU); @@ -936,7 +934,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, break; } - psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); + psi_group_change(group, cpu, 0, TSK_ONCPU, true); } while ((group = group->parent)); } @@ -974,7 +972,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, do { if (group == common) break; - psi_group_change(group, cpu, clear, set, now, wake_clock); + psi_group_change(group, cpu, clear, set, wake_clock); } while ((group = group->parent)); /* @@ -986,7 +984,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) { clear &= ~TSK_ONCPU; for (; group; group = group->parent) - psi_group_change(group, cpu, clear, set, now, wake_clock); + psi_group_change(group, cpu, clear, set, wake_clock); } } } @@ -997,8 +995,8 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st int cpu = task_cpu(curr); struct psi_group *group; struct psi_group_cpu *groupc; - u64 now, irq; s64 delta; + u64 irq; if (static_branch_likely(&psi_disabled)) return; @@ -1011,7 +1009,6 @@ void psi_account_irqtime(struct rq *rq, struct task_struct 
*curr, struct task_st if (prev && task_psi_group(prev) == group) return; - now = cpu_clock(cpu); irq = irq_time_read(cpu); delta = (s64)(irq - rq->psi_irq_time); if (delta < 0) @@ -1019,12 +1016,15 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st rq->psi_irq_time = irq; do { + u64 now; + if (!group->enabled) continue; groupc = per_cpu_ptr(group->pcpu, cpu); write_seqcount_begin(&groupc->seq); + now = cpu_clock(cpu); record_times(groupc, now); groupc->times[PSI_IRQ_FULL] += delta; @@ -1223,11 +1223,9 @@ void psi_cgroup_restart(struct psi_group *group) for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); struct rq_flags rf; - u64 now; rq_lock_irq(rq, &rf); - now = cpu_clock(cpu); - psi_group_change(group, cpu, 0, 0, now, true); + psi_group_change(group, cpu, 0, 0, true); rq_unlock_irq(rq, &rf); } } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 172c588de542..bd66a46b06ac 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -528,7 +528,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { - struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; + struct task_struct *donor = rq_of_rt_rq(rt_rq)->donor; struct rq *rq = rq_of_rt_rq(rt_rq); struct sched_rt_entity *rt_se; @@ -542,7 +542,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) else if (!on_rt_rq(rt_se)) enqueue_rt_entity(rt_se, 0); - if (rt_rq->highest_prio.curr < curr->prio) + if (rt_rq->highest_prio.curr < donor->prio) resched_curr(rq); } } @@ -988,10 +988,10 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se) */ static void update_curr_rt(struct rq *rq) { - struct task_struct *curr = rq->curr; + struct task_struct *donor = rq->donor; s64 delta_exec; - if (curr->sched_class != &rt_sched_class) + if (donor->sched_class != &rt_sched_class) return; delta_exec = update_curr_common(rq); @@ -999,7 +999,7 @@ static void update_curr_rt(struct rq *rq) return; #ifdef 
CONFIG_RT_GROUP_SCHED - struct sched_rt_entity *rt_se = &curr->rt; + struct sched_rt_entity *rt_se = &donor->rt; if (!rt_bandwidth_enabled()) return; @@ -1535,7 +1535,7 @@ static int find_lowest_rq(struct task_struct *task); static int select_task_rq_rt(struct task_struct *p, int cpu, int flags) { - struct task_struct *curr; + struct task_struct *curr, *donor; struct rq *rq; bool test; @@ -1547,6 +1547,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int flags) rcu_read_lock(); curr = READ_ONCE(rq->curr); /* unlocked access */ + donor = READ_ONCE(rq->donor); /* * If the current task on @p's runqueue is an RT task, then @@ -1575,8 +1576,8 @@ select_task_rq_rt(struct task_struct *p, int cpu, int flags) * systems like big.LITTLE. */ test = curr && - unlikely(rt_task(curr)) && - (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio); + unlikely(rt_task(donor)) && + (curr->nr_cpus_allowed < 2 || donor->prio <= p->prio); if (test || !rt_task_fits_capacity(p, cpu)) { int target = find_lowest_rq(p); @@ -1606,12 +1607,8 @@ out: static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - /* - * Current can't be migrated, useless to reschedule, - * let's hope p can move out. - */ if (rq->curr->nr_cpus_allowed == 1 || - !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) + !cpupri_find(&rq->rd->cpupri, rq->donor, NULL)) return; /* @@ -1654,7 +1651,9 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) */ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) { - if (p->prio < rq->curr->prio) { + struct task_struct *donor = rq->donor; + + if (p->prio < donor->prio) { resched_curr(rq); return; } @@ -1672,7 +1671,7 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) * to move current somewhere else, making room for our non-migratable * task. 
*/ - if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr)) + if (p->prio == donor->prio && !test_tsk_need_resched(rq->curr)) check_preempt_equal_prio(rq, p); #endif } @@ -1697,7 +1696,7 @@ static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool f * utilization. We only care of the case where we start to schedule a * rt task */ - if (rq->curr->sched_class != &rt_sched_class) + if (rq->donor->sched_class != &rt_sched_class) update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); rt_queue_push_tasks(rq); @@ -1773,15 +1772,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_s /* Only try algorithms three times */ #define RT_MAX_TRIES 3 -static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) -{ - if (!task_on_cpu(rq, p) && - cpumask_test_cpu(cpu, &p->cpus_mask)) - return 1; - - return 0; -} - /* * Return the highest pushable rq's task, which is suitable to be executed * on the CPU, NULL otherwise @@ -1795,7 +1785,7 @@ static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) return NULL; plist_for_each_entry(p, head, pushable_tasks) { - if (pick_rt_task(rq, p, cpu)) + if (task_is_pushable(rq, p, cpu)) return p; } @@ -1968,6 +1958,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); + BUG_ON(task_current_donor(rq, p)); BUG_ON(p->nr_cpus_allowed <= 1); BUG_ON(!task_on_rq_queued(p)); @@ -2000,7 +1991,7 @@ retry: * higher priority than current. If that's the case * just reschedule current. */ - if (unlikely(next_task->prio < rq->curr->prio)) { + if (unlikely(next_task->prio < rq->donor->prio)) { resched_curr(rq); return 0; } @@ -2021,7 +2012,7 @@ retry: * Note that the stoppers are masqueraded as SCHED_FIFO * (cf. sched_set_stop_task()), so we can't rely on rt_task(). 
*/ - if (rq->curr->sched_class != &rt_sched_class) + if (rq->donor->sched_class != &rt_sched_class) return 0; cpu = find_lowest_rq(rq->curr); @@ -2088,9 +2079,7 @@ retry: goto retry; } - deactivate_task(rq, next_task, 0); - set_task_cpu(next_task, lowest_rq->cpu); - activate_task(lowest_rq, next_task, 0); + move_queued_task_locked(rq, lowest_rq, next_task); resched_curr(lowest_rq); ret = 1; @@ -2355,15 +2344,13 @@ static void pull_rt_task(struct rq *this_rq) * p if it is lower in priority than the * current task on the run queue */ - if (p->prio < src_rq->curr->prio) + if (p->prio < src_rq->donor->prio) goto skip; if (is_migration_disabled(p)) { push_task = get_push_task(src_rq); } else { - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); + move_queued_task_locked(src_rq, this_rq, p); resched = true; } /* @@ -2399,9 +2386,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) bool need_to_push = !task_on_cpu(rq, p) && !test_tsk_need_resched(rq->curr) && p->nr_cpus_allowed > 1 && - (dl_task(rq->curr) || rt_task(rq->curr)) && + (dl_task(rq->donor) || rt_task(rq->donor)) && (rq->curr->nr_cpus_allowed < 2 || - rq->curr->prio <= p->prio); + rq->donor->prio <= p->prio); if (need_to_push) push_rt_tasks(rq); @@ -2485,7 +2472,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) rt_queue_push_tasks(rq); #endif /* CONFIG_SMP */ - if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) + if (p->prio < rq->donor->prio && cpu_online(cpu_of(rq))) resched_curr(rq); } } @@ -2500,7 +2487,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) if (!task_on_rq_queued(p)) return; - if (task_current(rq, p)) { + if (task_current_donor(rq, p)) { #ifdef CONFIG_SMP /* * If our priority decreases while running, we @@ -2526,7 +2513,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) * greater than the current running task * then 
reschedule. */ - if (p->prio < rq->curr->prio) + if (p->prio < rq->donor->prio) resched_curr(rq); } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b1c3588a8f00..76f5f53a645f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -751,8 +751,9 @@ enum scx_rq_flags { */ SCX_RQ_ONLINE = 1 << 0, SCX_RQ_CAN_STOP_TICK = 1 << 1, - SCX_RQ_BAL_KEEP = 1 << 2, /* balance decided to keep current */ - SCX_RQ_BYPASSING = 1 << 3, + SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */ + SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */ + SCX_RQ_BYPASSING = 1 << 4, SCX_RQ_IN_WAKEUP = 1 << 16, SCX_RQ_IN_BALANCE = 1 << 17, @@ -1147,7 +1148,10 @@ struct rq { */ unsigned int nr_uninterruptible; - struct task_struct __rcu *curr; + union { + struct task_struct __rcu *donor; /* Scheduler context */ + struct task_struct __rcu *curr; /* Execution context */ + }; struct sched_dl_entity *dl_server; struct task_struct *idle; struct task_struct *stop; @@ -1344,6 +1348,11 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues) +static inline void rq_set_donor(struct rq *rq, struct task_struct *t) +{ + /* Do nothing */ +} + #ifdef CONFIG_SCHED_CORE static inline struct cpumask *sched_group_span(struct sched_group *sg); @@ -2085,34 +2094,6 @@ static inline const struct cpumask *task_user_cpus(struct task_struct *p) #endif /* CONFIG_SMP */ -#include "stats.h" - -#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS) - -extern void __sched_core_account_forceidle(struct rq *rq); - -static inline void sched_core_account_forceidle(struct rq *rq) -{ - if (schedstat_enabled()) - __sched_core_account_forceidle(rq); -} - -extern void __sched_core_tick(struct rq *rq); - -static inline void sched_core_tick(struct rq *rq) -{ - if (sched_core_enabled(rq) && schedstat_enabled()) - __sched_core_tick(rq); -} - -#else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */ - -static 
inline void sched_core_account_forceidle(struct rq *rq) { } - -static inline void sched_core_tick(struct rq *rq) { } - -#endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */ - #ifdef CONFIG_CGROUP_SCHED /* @@ -2260,11 +2241,25 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } +/* + * Is p the current execution context? + */ static inline int task_current(struct rq *rq, struct task_struct *p) { return rq->curr == p; } +/* + * Is p the current scheduling context? + * + * Note that it might be the current execution context at the same time if + * rq->curr == rq->donor == p. + */ +static inline int task_current_donor(struct rq *rq, struct task_struct *p) +{ + return rq->donor == p; +} + static inline int task_on_cpu(struct rq *rq, struct task_struct *p) { #ifdef CONFIG_SMP @@ -2292,6 +2287,7 @@ static inline int task_on_rq_migrating(struct task_struct *p) #define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ #define WF_MIGRATED 0x20 /* Internal use, task got migrated */ #define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. 
*/ +#define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */ #ifdef CONFIG_SMP static_assert(WF_EXEC == SD_BALANCE_EXEC); @@ -2334,6 +2330,7 @@ extern const u32 sched_prio_to_wmult[40]; * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) * ENQUEUE_MIGRATED - the task was migrated during wakeup + * ENQUEUE_RQ_SELECTED - ->select_task_rq() was called * */ @@ -2360,6 +2357,7 @@ extern const u32 sched_prio_to_wmult[40]; #define ENQUEUE_INITIAL 0x80 #define ENQUEUE_MIGRATING 0x100 #define ENQUEUE_DELAYED 0x200 +#define ENQUEUE_RQ_SELECTED 0x400 #define RETRY_TASK ((void *)-1UL) @@ -2448,7 +2446,7 @@ struct sched_class { static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { - WARN_ON_ONCE(rq->curr != prev); + WARN_ON_ONCE(rq->donor != prev); prev->sched_class->put_prev_task(rq, prev, NULL); } @@ -2612,7 +2610,7 @@ static inline cpumask_t *alloc_user_cpus_ptr(int node) static inline struct task_struct *get_push_task(struct rq *rq) { - struct task_struct *p = rq->curr; + struct task_struct *p = rq->donor; lockdep_assert_rq_held(rq); @@ -2692,6 +2690,7 @@ extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); extern void resched_curr(struct rq *rq); +extern void resched_curr_lazy(struct rq *rq); extern void resched_cpu(int cpu); extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); @@ -2766,8 +2765,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) static inline void __block_task(struct rq *rq, struct task_struct *p) { - WRITE_ONCE(p->on_rq, 0); - ASSERT_EXCLUSIVE_WRITER(p->on_rq); if (p->sched_contributes_to_load) rq->nr_uninterruptible++; @@ -2775,6 +2772,38 @@ static inline void __block_task(struct rq *rq, struct task_struct *p) atomic_inc(&rq->nr_iowait); delayacct_blkio_start(); } + + ASSERT_EXCLUSIVE_WRITER(p->on_rq); + + /* + * The moment this write goes through, ttwu() can swoop in 
and migrate + * this task, rendering our rq->__lock ineffective. + * + * __schedule() try_to_wake_up() + * LOCK rq->__lock LOCK p->pi_lock + * pick_next_task() + * pick_next_task_fair() + * pick_next_entity() + * dequeue_entities() + * __block_task() + * RELEASE p->on_rq = 0 if (p->on_rq && ...) + * break; + * + * ACQUIRE (after ctrl-dep) + * + * cpu = select_task_rq(); + * set_task_cpu(p, cpu); + * ttwu_queue() + * ttwu_do_activate() + * LOCK rq->__lock + * activate_task() + * STORE p->on_rq = 1 + * UNLOCK rq->__lock + * + * Callers must ensure to not reference @p after this -- we no longer + * own it. + */ + smp_store_release(&p->on_rq, 0); } extern void activate_task(struct rq *rq, struct task_struct *p, int flags); @@ -3166,6 +3195,34 @@ extern void nohz_run_idle_balance(int cpu); static inline void nohz_run_idle_balance(int cpu) { } #endif +#include "stats.h" + +#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS) + +extern void __sched_core_account_forceidle(struct rq *rq); + +static inline void sched_core_account_forceidle(struct rq *rq) +{ + if (schedstat_enabled()) + __sched_core_account_forceidle(rq); +} + +extern void __sched_core_tick(struct rq *rq); + +static inline void sched_core_tick(struct rq *rq) +{ + if (sched_core_enabled(rq) && schedstat_enabled()) + __sched_core_tick(rq); +} + +#else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */ + +static inline void sched_core_account_forceidle(struct rq *rq) { } + +static inline void sched_core_tick(struct rq *rq) { } + +#endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */ + #ifdef CONFIG_IRQ_TIME_ACCOUNTING struct irqtime { @@ -3596,24 +3653,41 @@ static inline void mm_cid_put(struct mm_struct *mm) __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); } -static inline int __mm_cid_try_get(struct mm_struct *mm) +static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm) { - struct cpumask *cpumask; - int cid; + struct cpumask *cidmask = mm_cidmask(mm); + struct mm_cid __percpu 
*pcpu_cid = mm->pcpu_cid; + int cid = __this_cpu_read(pcpu_cid->recent_cid); - cpumask = mm_cidmask(mm); + /* Try to re-use recent cid. This improves cache locality. */ + if (!mm_cid_is_unset(cid) && !cpumask_test_and_set_cpu(cid, cidmask)) + return cid; + /* + * Expand cid allocation if the maximum number of concurrency + * IDs allocated (max_nr_cid) is below the number cpus allowed + * and number of threads. Expanding cid allocation as much as + * possible improves cache locality. + */ + cid = atomic_read(&mm->max_nr_cid); + while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) { + if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1)) + continue; + if (!cpumask_test_and_set_cpu(cid, cidmask)) + return cid; + } /* + * Find the first available concurrency id. * Retry finding first zero bit if the mask is temporarily * filled. This only happens during concurrent remote-clear * which owns a cid without holding a rq lock. */ for (;;) { - cid = cpumask_first_zero(cpumask); - if (cid < nr_cpu_ids) + cid = cpumask_first_zero(cidmask); + if (cid < READ_ONCE(mm->nr_cpus_allowed)) break; cpu_relax(); } - if (cpumask_test_and_set_cpu(cid, cpumask)) + if (cpumask_test_and_set_cpu(cid, cidmask)) return -1; return cid; @@ -3631,7 +3705,8 @@ static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm) WRITE_ONCE(pcpu_cid->time, rq->clock); } -static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm) +static inline int __mm_cid_get(struct rq *rq, struct task_struct *t, + struct mm_struct *mm) { int cid; @@ -3641,13 +3716,13 @@ static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm) * guarantee forward progress. 
*/ if (!READ_ONCE(use_cid_lock)) { - cid = __mm_cid_try_get(mm); + cid = __mm_cid_try_get(t, mm); if (cid >= 0) goto end; raw_spin_lock(&cid_lock); } else { raw_spin_lock(&cid_lock); - cid = __mm_cid_try_get(mm); + cid = __mm_cid_try_get(t, mm); if (cid >= 0) goto unlock; } @@ -3667,7 +3742,7 @@ static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm) * all newcoming allocations observe the use_cid_lock flag set. */ do { - cid = __mm_cid_try_get(mm); + cid = __mm_cid_try_get(t, mm); cpu_relax(); } while (cid < 0); /* @@ -3684,7 +3759,8 @@ end: return cid; } -static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm) +static inline int mm_cid_get(struct rq *rq, struct task_struct *t, + struct mm_struct *mm) { struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; struct cpumask *cpumask; @@ -3701,8 +3777,9 @@ static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm) if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); } - cid = __mm_cid_get(rq, mm); + cid = __mm_cid_get(rq, t, mm); __this_cpu_write(pcpu_cid->cid, cid); + __this_cpu_write(pcpu_cid->recent_cid, cid); return cid; } @@ -3755,7 +3832,7 @@ static inline void switch_mm_cid(struct rq *rq, prev->mm_cid = -1; } if (next->mm_cid_active) - next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next->mm); + next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm); } #else /* !CONFIG_SCHED_MM_CID: */ @@ -3768,6 +3845,28 @@ static inline void init_sched_mm_cid(struct task_struct *t) { } extern u64 avg_vruntime(struct cfs_rq *cfs_rq); extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); +#ifdef CONFIG_SMP +static inline +void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_struct *task) +{ + lockdep_assert_rq_held(src_rq); + lockdep_assert_rq_held(dst_rq); + + deactivate_task(src_rq, task, 0); + set_task_cpu(task, dst_rq->cpu); + activate_task(dst_rq, task, 0); +} + +static inline 
+bool task_is_pushable(struct rq *rq, struct task_struct *p, int cpu) +{ + if (!task_on_cpu(rq, p) && + cpumask_test_cpu(cpu, &p->cpus_mask)) + return true; + + return false; +} +#endif #ifdef CONFIG_RT_MUTEXES @@ -3797,7 +3896,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio) extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi); extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx); -extern void __setscheduler_prio(struct task_struct *p, int prio); +extern const struct sched_class *__setscheduler_class(int policy, int prio); extern void set_load_weight(struct task_struct *p, bool update_load); extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 237780aa3c53..8ee0add5a48a 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -119,44 +119,71 @@ static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, /* * PSI tracks state that persists across sleeps, such as iowaits and * memory stalls. As a result, it has to distinguish between sleeps, - * where a task's runnable state changes, and requeues, where a task - * and its state are being moved between CPUs and runqueues. + * where a task's runnable state changes, and migrations, where a task + * and its runnable state are being moved between CPUs and runqueues. + * + * A notable case is a task whose dequeue is delayed. PSI considers + * those sleeping, but because they are still on the runqueue they can + * go through migration requeues. In this case, *sleeping* states need + * to be transferred. 
*/ -static inline void psi_enqueue(struct task_struct *p, bool wakeup) +static inline void psi_enqueue(struct task_struct *p, int flags) { - int clear = 0, set = TSK_RUNNING; + int clear = 0, set = 0; if (static_branch_likely(&psi_disabled)) return; - if (p->in_memstall) - set |= TSK_MEMSTALL_RUNNING; + /* Same runqueue, nothing changed for psi */ + if (flags & ENQUEUE_RESTORE) + return; - if (!wakeup) { + if (p->se.sched_delayed) { + /* CPU migration of "sleeping" task */ + SCHED_WARN_ON(!(flags & ENQUEUE_MIGRATED)); if (p->in_memstall) set |= TSK_MEMSTALL; + if (p->in_iowait) + set |= TSK_IOWAIT; + } else if (flags & ENQUEUE_MIGRATED) { + /* CPU migration of runnable task */ + set = TSK_RUNNING; + if (p->in_memstall) + set |= TSK_MEMSTALL | TSK_MEMSTALL_RUNNING; } else { + /* Wakeup of new or sleeping task */ if (p->in_iowait) clear |= TSK_IOWAIT; + set = TSK_RUNNING; + if (p->in_memstall) + set |= TSK_MEMSTALL_RUNNING; } psi_task_change(p, clear, set); } -static inline void psi_dequeue(struct task_struct *p, bool sleep) +static inline void psi_dequeue(struct task_struct *p, int flags) { if (static_branch_likely(&psi_disabled)) return; + /* Same runqueue, nothing changed for psi */ + if (flags & DEQUEUE_SAVE) + return; + /* * A voluntary sleep is a dequeue followed by a task switch. To * avoid walking all ancestors twice, psi_task_switch() handles * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU. * Do nothing here. */ - if (sleep) + if (flags & DEQUEUE_SLEEP) return; + /* + * When migrating a task to another CPU, clear all psi + * state. The enqueue callback above will work it out. 
+ */ psi_task_change(p, p->psi_flags, 0); } @@ -190,8 +217,8 @@ static inline void psi_sched_switch(struct task_struct *prev, } #else /* CONFIG_PSI */ -static inline void psi_enqueue(struct task_struct *p, bool wakeup) {} -static inline void psi_dequeue(struct task_struct *p, bool sleep) {} +static inline void psi_enqueue(struct task_struct *p, bool migrate) {} +static inline void psi_dequeue(struct task_struct *p, bool migrate) {} static inline void psi_ttwu_dequeue(struct task_struct *p) {} static inline void psi_sched_switch(struct task_struct *prev, struct task_struct *next, diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index aa70beee9895..0d71fcbaf1e3 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -91,7 +91,7 @@ void set_user_nice(struct task_struct *p, long nice) } queued = task_on_rq_queued(p); - running = task_current(rq, p); + running = task_current_donor(rq, p); if (queued) dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); if (running) @@ -529,7 +529,7 @@ int __sched_setscheduler(struct task_struct *p, { int oldpolicy = -1, policy = attr->sched_policy; int retval, oldprio, newprio, queued, running; - const struct sched_class *prev_class; + const struct sched_class *prev_class, *next_class; struct balance_callback *head; struct rq_flags rf; int reset_on_fork; @@ -706,18 +706,23 @@ change: queue_flags &= ~DEQUEUE_MOVE; } + prev_class = p->sched_class; + next_class = __setscheduler_class(policy, newprio); + + if (prev_class != next_class && p->se.sched_delayed) + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); + queued = task_on_rq_queued(p); - running = task_current(rq, p); + running = task_current_donor(rq, p); if (queued) dequeue_task(rq, p, queue_flags); if (running) put_prev_task(rq, p); - prev_class = p->sched_class; - if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { __setscheduler_params(p, attr); - __setscheduler_prio(p, newprio); + p->sched_class = next_class; + p->prio = 
newprio; } __setscheduler_uclamp(p, attr); check_class_changing(rq, p, prev_class); @@ -1076,45 +1081,6 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; } -/* - * Copy the kernel size attribute structure (which might be larger - * than what user-space knows about) to user-space. - * - * Note that all cases are valid: user-space buffer can be larger or - * smaller than the kernel-space buffer. The usual case is that both - * have the same size. - */ -static int -sched_attr_copy_to_user(struct sched_attr __user *uattr, - struct sched_attr *kattr, - unsigned int usize) -{ - unsigned int ksize = sizeof(*kattr); - - if (!access_ok(uattr, usize)) - return -EFAULT; - - /* - * sched_getattr() ABI forwards and backwards compatibility: - * - * If usize == ksize then we just copy everything to user-space and all is good. - * - * If usize < ksize then we only copy as much as user-space has space for, - * this keeps ABI compatibility as well. We skip the rest. - * - * If usize > ksize then user-space is using a newer version of the ABI, - * which part the kernel doesn't know about. Just ignore it - tooling can - * detect the kernel's knowledge of attributes from the attr->size value - * which is set to ksize in this case. - */ - kattr->size = min(usize, ksize); - - if (copy_to_user(uattr, kattr, kattr->size)) - return -EFAULT; - - return 0; -} - /** * sys_sched_getattr - similar to sched_getparam, but with sched_attr * @pid: the pid in question. 
@@ -1159,7 +1125,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, #endif } - return sched_attr_copy_to_user(uattr, &kattr, usize); + kattr.size = min(usize, sizeof(kattr)); + return copy_struct_to_user(uattr, usize, &kattr, sizeof(kattr), NULL); } #ifdef CONFIG_SMP diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 134d7112ef71..b410b61cec95 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -9,7 +9,7 @@ static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; -wait_queue_head_t *bit_waitqueue(void *word, int bit) +wait_queue_head_t *bit_waitqueue(unsigned long *word, int bit) { const int shift = BITS_PER_LONG == 32 ? 5 : 6; unsigned long val = (unsigned long)word << shift | bit; @@ -55,7 +55,7 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_ } EXPORT_SYMBOL(__wait_on_bit); -int __sched out_of_line_wait_on_bit(void *word, int bit, +int __sched out_of_line_wait_on_bit(unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode) { struct wait_queue_head *wq_head = bit_waitqueue(word, bit); @@ -66,7 +66,7 @@ int __sched out_of_line_wait_on_bit(void *word, int bit, EXPORT_SYMBOL(out_of_line_wait_on_bit); int __sched out_of_line_wait_on_bit_timeout( - void *word, int bit, wait_bit_action_f *action, + unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode, unsigned long timeout) { struct wait_queue_head *wq_head = bit_waitqueue(word, bit); @@ -108,7 +108,7 @@ __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry } EXPORT_SYMBOL(__wait_on_bit_lock); -int __sched out_of_line_wait_on_bit_lock(void *word, int bit, +int __sched out_of_line_wait_on_bit_lock(unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode) { struct wait_queue_head *wq_head = bit_waitqueue(word, bit); @@ -118,7 +118,7 @@ int __sched out_of_line_wait_on_bit_lock(void *word, int bit, } 
EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); -void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) +void __wake_up_bit(struct wait_queue_head *wq_head, unsigned long *word, int bit) { struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); @@ -128,23 +128,31 @@ void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) EXPORT_SYMBOL(__wake_up_bit); /** - * wake_up_bit - wake up a waiter on a bit - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on + * wake_up_bit - wake up waiters on a bit + * @word: the address containing the bit being waited on + * @bit: the bit at that address being waited on * - * There is a standard hashed waitqueue table for generic use. This - * is the part of the hash-table's accessor API that wakes up waiters - * on a bit. For instance, if one were to have waiters on a bitflag, - * one would call wake_up_bit() after clearing the bit. + * Wake up any process waiting in wait_on_bit() or similar for the + * given bit to be cleared. * - * In order for this to function properly, as it uses waitqueue_active() - * internally, some kind of memory barrier must be done prior to calling - * this. Typically, this will be smp_mb__after_atomic(), but in some - * cases where bitflags are manipulated non-atomically under a lock, one - * may need to use a less regular barrier, such fs/inode.c's smp_mb(), - * because spin_unlock() does not guarantee a memory barrier. + * The wake-up is sent to tasks in a waitqueue selected by hash from a + * shared pool. Only those tasks on that queue which have requested + * wake_up on this specific address and bit will be woken, and only if the + * bit is clear. + * + * In order for this to function properly there must be a full memory + * barrier after the bit is cleared and before this function is called. 
+ * If the bit was cleared atomically, such as by clear_bit() then + * smp_mb__after_atomic() can be used, otherwise smp_mb() is needed. + * If the bit was cleared with a fully-ordered operation, no further + * barrier is required. + * + * Normally the bit should be cleared by an operation with RELEASE + * semantics so that any changes to memory made before the bit is + * cleared are guaranteed to be visible after the matching wait_on_bit() + * completes. */ -void wake_up_bit(void *word, int bit) +void wake_up_bit(unsigned long *word, int bit) { __wake_up_bit(bit_waitqueue(word, bit), word, bit); } @@ -188,6 +196,36 @@ void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int } EXPORT_SYMBOL(init_wait_var_entry); +/** + * wake_up_var - wake up waiters on a variable (kernel address) + * @var: the address of the variable being waited on + * + * Wake up any process waiting in wait_var_event() or similar for the + * given variable to change. wait_var_event() can be waiting for an + * arbitrary condition to be true and associates that condition with an + * address. Calling wake_up_var() suggests that the condition has been + * made true, but does not strictly require the condition to use the + * address given. + * + * The wake-up is sent to tasks in a waitqueue selected by hash from a + * shared pool. Only those tasks on that queue which have requested + * wake_up on this specific address will be woken. + * + * In order for this to function properly there must be a full memory + * barrier after the variable is updated (or more accurately, after the + * condition waited on has been made to be true) and before this function + * is called. If the variable was updated atomically, such as by + * atomic_dec() then smp_mb__after_atomic() can be used. If the + * variable was updated by a fully ordered operation such as + * atomic_dec_and_test() then no extra barrier is required. Otherwise + * smp_mb() is needed. 
+ * + * Normally the variable should be updated (the condition should be made + * to be true) by an operation with RELEASE semantics such as + * smp_store_release() so that any changes to memory made before the + * variable was updated are guaranteed to be visible after the matching + * wait_var_event() completes. + */ void wake_up_var(void *var) { __wake_up_bit(__var_waitqueue(var), var, -1); @@ -228,20 +266,6 @@ __sched int bit_wait_timeout(struct wait_bit_key *word, int mode) } EXPORT_SYMBOL_GPL(bit_wait_timeout); -__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) -{ - unsigned long now = READ_ONCE(jiffies); - - if (time_after_eq(now, word->timeout)) - return -EAGAIN; - io_schedule_timeout(word->timeout - now); - if (signal_pending_state(mode, current)) - return -EINTR; - - return 0; -} -EXPORT_SYMBOL_GPL(bit_wait_io_timeout); - void __init wait_bit_init(void) { int i; diff --git a/kernel/signal.c b/kernel/signal.c index 10b464b9d91f..98b65cb35830 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -414,7 +414,8 @@ static struct ucounts *sig_get_ucounts(struct task_struct *t, int sig, */ rcu_read_lock(); ucounts = task_ucounts(t); - sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING); + sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, + override_rlimit); rcu_read_unlock(); if (!sigpending) return NULL; @@ -4006,7 +4007,6 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, siginfo_t __user *, info, unsigned int, flags) { int ret; - struct fd f; struct pid *pid; kernel_siginfo_t kinfo; enum pid_type type; @@ -4019,20 +4019,17 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1) return -EINVAL; - f = fdget(pidfd); - if (!fd_file(f)) + CLASS(fd, f)(pidfd); + if (fd_empty(f)) return -EBADF; /* Is this a pidfd? 
*/ pid = pidfd_to_pid(fd_file(f)); - if (IS_ERR(pid)) { - ret = PTR_ERR(pid); - goto err; - } + if (IS_ERR(pid)) + return PTR_ERR(pid); - ret = -EINVAL; if (!access_pidfd_pidns(pid)) - goto err; + return -EINVAL; switch (flags) { case 0: @@ -4056,28 +4053,23 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, if (info) { ret = copy_siginfo_from_user_any(&kinfo, info); if (unlikely(ret)) - goto err; + return ret; - ret = -EINVAL; if (unlikely(sig != kinfo.si_signo)) - goto err; + return -EINVAL; /* Only allow sending arbitrary signals to yourself. */ - ret = -EPERM; if ((task_pid(current) != pid || type > PIDTYPE_TGID) && (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) - goto err; + return -EPERM; } else { prepare_kill_siginfo(sig, &kinfo, type); } if (type == PIDTYPE_PGID) - ret = kill_pgrp_info(sig, &kinfo, pid); + return kill_pgrp_info(sig, &kinfo, pid); else - ret = kill_pid_info_type(sig, &kinfo, pid, type); -err: - fdput(f); - return ret; + return kill_pid_info_type(sig, &kinfo, pid, type); } static int diff --git a/kernel/smp.c b/kernel/smp.c index f25e20617b7e..27dc31a146a3 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -246,7 +246,7 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in return true; } - ts2 = sched_clock(); + ts2 = ktime_get_mono_fast_ns(); /* How long since we last checked for a stuck CSD lock.*/ ts_delta = ts2 - *ts1; if (likely(ts_delta <= csd_lock_timeout_ns * (*nmessages + 1) * @@ -321,7 +321,7 @@ static void __csd_lock_wait(call_single_data_t *csd) int bug_id = 0; u64 ts0, ts1; - ts1 = ts0 = sched_clock(); + ts1 = ts0 = ktime_get_mono_fast_ns(); for (;;) { if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id, &nmessages)) break; diff --git a/kernel/softirq.c b/kernel/softirq.c index d082e7840f88..8b41bd13cc3d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -624,6 +624,24 @@ static inline void tick_irq_exit(void) #endif } +#ifdef CONFIG_IRQ_FORCED_THREADING +DEFINE_PER_CPU(struct 
task_struct *, ktimerd); +DEFINE_PER_CPU(unsigned long, pending_timer_softirq); + +static void wake_timersd(void) +{ + struct task_struct *tsk = __this_cpu_read(ktimerd); + + if (tsk) + wake_up_process(tsk); +} + +#else + +static inline void wake_timersd(void) { } + +#endif + static inline void __irq_exit_rcu(void) { #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED @@ -636,6 +654,10 @@ static inline void __irq_exit_rcu(void) if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); + if (IS_ENABLED(CONFIG_IRQ_FORCED_THREADING) && force_irqthreads() && + local_timers_pending_force_th() && !(in_nmi() | in_hardirq())) + wake_timersd(); + tick_irq_exit(); } @@ -748,10 +770,8 @@ EXPORT_SYMBOL(__tasklet_hi_schedule); static bool tasklet_clear_sched(struct tasklet_struct *t) { - if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) { - wake_up_var(&t->state); + if (test_and_clear_wake_up_bit(TASKLET_STATE_SCHED, &t->state)) return true; - } WARN_ONCE(1, "tasklet SCHED state not set: %s %pS\n", t->use_callback ? 
"callback" : "func", @@ -871,8 +891,7 @@ void tasklet_kill(struct tasklet_struct *t) if (in_interrupt()) pr_notice("Attempt to kill tasklet from interrupt\n"); - while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) - wait_var_event(&t->state, !test_bit(TASKLET_STATE_SCHED, &t->state)); + wait_on_bit_lock(&t->state, TASKLET_STATE_SCHED, TASK_UNINTERRUPTIBLE); tasklet_unlock_wait(t); tasklet_clear_sched(t); @@ -882,16 +901,13 @@ EXPORT_SYMBOL(tasklet_kill); #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) void tasklet_unlock(struct tasklet_struct *t) { - smp_mb__before_atomic(); - clear_bit(TASKLET_STATE_RUN, &t->state); - smp_mb__after_atomic(); - wake_up_var(&t->state); + clear_and_wake_up_bit(TASKLET_STATE_RUN, &t->state); } EXPORT_SYMBOL_GPL(tasklet_unlock); void tasklet_unlock_wait(struct tasklet_struct *t) { - wait_var_event(&t->state, !test_bit(TASKLET_STATE_RUN, &t->state)); + wait_on_bit(&t->state, TASKLET_STATE_RUN, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL_GPL(tasklet_unlock_wait); #endif @@ -971,12 +987,57 @@ static struct smp_hotplug_thread softirq_threads = { .thread_comm = "ksoftirqd/%u", }; +#ifdef CONFIG_IRQ_FORCED_THREADING +static void ktimerd_setup(unsigned int cpu) +{ + /* Above SCHED_NORMAL to handle timers before regular tasks. 
*/ + sched_set_fifo_low(current); +} + +static int ktimerd_should_run(unsigned int cpu) +{ + return local_timers_pending_force_th(); +} + +void raise_ktimers_thread(unsigned int nr) +{ + trace_softirq_raise(nr); + __this_cpu_or(pending_timer_softirq, BIT(nr)); +} + +static void run_ktimerd(unsigned int cpu) +{ + unsigned int timer_si; + + ksoftirqd_run_begin(); + + timer_si = local_timers_pending_force_th(); + __this_cpu_write(pending_timer_softirq, 0); + or_softirq_pending(timer_si); + + __do_softirq(); + + ksoftirqd_run_end(); +} + +static struct smp_hotplug_thread timer_thread = { + .store = &ktimerd, + .setup = ktimerd_setup, + .thread_should_run = ktimerd_should_run, + .thread_fn = run_ktimerd, + .thread_comm = "ktimers/%u", +}; +#endif + static __init int spawn_ksoftirqd(void) { cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL, takeover_tasklets); BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); - +#ifdef CONFIG_IRQ_FORCED_THREADING + if (force_irqthreads()) + BUG_ON(smpboot_register_percpu_thread(&timer_thread)); +#endif return 0; } early_initcall(spawn_ksoftirqd); diff --git a/kernel/sys.c b/kernel/sys.c index 4da31f28fda8..c4c701c6f0b4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1911,12 +1911,11 @@ SYSCALL_DEFINE1(umask, int, mask) static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { - struct fd exe; + CLASS(fd, exe)(fd); struct inode *inode; int err; - exe = fdget(fd); - if (!fd_file(exe)) + if (fd_empty(exe)) return -EBADF; inode = file_inode(fd_file(exe)); @@ -1926,18 +1925,14 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) * sure that this one is executable as well, to avoid breaking an * overall picture. 
*/ - err = -EACCES; if (!S_ISREG(inode->i_mode) || path_noexec(&fd_file(exe)->f_path)) - goto exit; + return -EACCES; err = file_permission(fd_file(exe), MAY_EXEC); if (err) - goto exit; + return err; - err = replace_mm_exe_file(mm, fd_file(exe)); -exit: - fdput(exe); - return err; + return replace_mm_exe_file(mm, fd_file(exe)); } /* @@ -2324,6 +2319,21 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, return -EINVAL; } +int __weak arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status) +{ + return -EINVAL; +} + +int __weak arch_set_shadow_stack_status(struct task_struct *t, unsigned long status) +{ + return -EINVAL; +} + +int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status) +{ + return -EINVAL; +} + #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) #ifdef CONFIG_ANON_VMA_NAME @@ -2784,6 +2794,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_RISCV_SET_ICACHE_FLUSH_CTX: error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3); break; + case PR_GET_SHADOW_STACK_STATUS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = arch_get_shadow_stack_status(me, (unsigned long __user *) arg2); + break; + case PR_SET_SHADOW_STACK_STATUS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = arch_set_shadow_stack_status(me, arg2); + break; + case PR_LOCK_SHADOW_STACK_STATUS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = arch_lock_shadow_stack_status(me, arg2); + break; default: error = -EINVAL; break; diff --git a/kernel/task_work.c b/kernel/task_work.c index 5d14d639ac71..c969f1f26be5 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -55,15 +55,26 @@ int task_work_add(struct task_struct *task, struct callback_head *work, enum task_work_notify_mode notify) { struct callback_head *head; + int flags = notify & TWA_FLAGS; + notify &= ~TWA_FLAGS; if (notify == TWA_NMI_CURRENT) { if (WARN_ON_ONCE(task != current)) return 
-EINVAL; if (!IS_ENABLED(CONFIG_IRQ_WORK)) return -EINVAL; } else { - /* record the work call stack in order to print it in KASAN reports */ - kasan_record_aux_stack(work); + /* + * Record the work call stack in order to print it in KASAN + * reports. + * + * Note that stack allocation can fail if TWAF_NO_ALLOC flag + * is set and new page is needed to expand the stack buffer. + */ + if (flags & TWAF_NO_ALLOC) + kasan_record_aux_stack_noalloc(work); + else + kasan_record_aux_stack(work); } head = READ_ONCE(task->task_works); diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 0700f40c53ac..0cd680ccc7e5 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -411,15 +411,14 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) struct nlattr *na; size_t size; u32 fd; - struct fd f; na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; if (!na) return -EINVAL; fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); - f = fdget(fd); - if (!fd_file(f)) + CLASS(fd, f)(fd); + if (fd_empty(f)) return 0; size = nla_total_size(sizeof(struct cgroupstats)); @@ -427,14 +426,13 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb, size); if (rc < 0) - goto err; + return rc; na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, sizeof(struct cgroupstats)); if (na == NULL) { nlmsg_free(rep_skb); - rc = -EMSGSIZE; - goto err; + return -EMSGSIZE; } stats = nla_data(na); @@ -443,14 +441,10 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) rc = cgroupstats_build(stats, fd_file(f)->f_path.dentry); if (rc < 0) { nlmsg_free(rep_skb); - goto err; + return rc; } - rc = send_reply(rep_skb, info); - -err: - fdput(f); - return rc; + return send_reply(rep_skb, info); } static int cmd_attr_register_cpumask(struct genl_info *info) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 55e9ffbcd49a..80fe3749d2db 100644 --- a/kernel/time/hrtimer.c 
+++ b/kernel/time/hrtimer.c @@ -1859,7 +1859,7 @@ retry: if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; - raise_softirq_irqoff(HRTIMER_SOFTIRQ); + raise_timer_softirq(HRTIMER_SOFTIRQ); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); @@ -1954,7 +1954,7 @@ void hrtimer_run_queues(void) if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; - raise_softirq_irqoff(HRTIMER_SOFTIRQ); + raise_timer_softirq(HRTIMER_SOFTIRQ); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index c2f3d0c490d5..1af0bb2cc45c 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -309,6 +309,9 @@ static int pc_clock_settime(clockid_t id, const struct timespec64 *ts) struct posix_clock_desc cd; int err; + if (!timespec64_valid_strict(ts)) + return -EINVAL; + err = get_clock_desc(id, &cd); if (err) return err; diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 68d6c1190ac7..fcca4e72f1ef 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -71,16 +71,16 @@ static __always_inline u64 cyc_to_ns(u64 cyc, u32 mult, u32 shift) notrace struct clock_read_data *sched_clock_read_begin(unsigned int *seq) { - *seq = raw_read_seqcount_latch(&cd.seq); + *seq = read_seqcount_latch(&cd.seq); return cd.read_data + (*seq & 1); } notrace int sched_clock_read_retry(unsigned int seq) { - return raw_read_seqcount_latch_retry(&cd.seq, seq); + return read_seqcount_latch_retry(&cd.seq, seq); } -unsigned long long noinstr sched_clock_noinstr(void) +static __always_inline unsigned long long __sched_clock(void) { struct clock_read_data *rd; unsigned int seq; @@ -98,11 +98,23 @@ unsigned long long noinstr sched_clock_noinstr(void) return res; } +unsigned long long noinstr 
sched_clock_noinstr(void) +{ + return __sched_clock(); +} + unsigned long long notrace sched_clock(void) { unsigned long long ns; preempt_disable_notrace(); - ns = sched_clock_noinstr(); + /* + * All of __sched_clock() is a seqcount_latch reader critical section, + * but relies on the raw helpers which are uninstrumented. For KCSAN, + * mark all accesses in __sched_clock() as atomic. + */ + kcsan_nestable_atomic_begin(); + ns = __sched_clock(); + kcsan_nestable_atomic_end(); preempt_enable_notrace(); return ns; } @@ -119,17 +131,19 @@ unsigned long long notrace sched_clock(void) */ static void update_clock_read_data(struct clock_read_data *rd) { - /* update the backup (odd) copy with the new data */ - cd.read_data[1] = *rd; - /* steer readers towards the odd copy */ - raw_write_seqcount_latch(&cd.seq); + write_seqcount_latch_begin(&cd.seq); /* now its safe for us to update the normal (even) copy */ cd.read_data[0] = *rd; /* switch readers back to the even copy */ - raw_write_seqcount_latch(&cd.seq); + write_seqcount_latch(&cd.seq); + + /* update the backup (odd) copy with the new data */ + cd.read_data[1] = *rd; + + write_seqcount_latch_end(&cd.seq); } /* @@ -267,7 +281,7 @@ void __init generic_sched_clock_init(void) */ static u64 notrace suspended_sched_clock_read(void) { - unsigned int seq = raw_read_seqcount_latch(&cd.seq); + unsigned int seq = read_seqcount_latch(&cd.seq); return cd.read_data[seq & 1].epoch_cyc; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 9f90c7333b1d..fa058510af9c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -426,6 +426,12 @@ static void tick_nohz_kick_task(struct task_struct *tsk) * smp_mb__after_spin_lock() * tick_nohz_task_switch() * LOAD p->tick_dep_mask + * + * XXX given a task picks up the dependency on schedule(), should we + * only care about tasks that are currently on the CPU instead of all + * that are on the runqueue? 
+ * + * That is, does this want to be: task_on_cpu() / task_curr()? */ if (!sched_task_on_rq(tsk)) return; @@ -851,7 +857,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) static inline bool local_timer_softirq_pending(void) { - return local_softirq_pending() & BIT(TIMER_SOFTIRQ); + return local_timers_pending() & BIT(TIMER_SOFTIRQ); } /* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d115adebc418..0ca85ff4fbb4 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -343,7 +343,7 @@ static __always_inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) * We want to use this from any context including NMI and tracing / * instrumenting the timekeeping code itself. * - * Employ the latch technique; see @raw_write_seqcount_latch. + * Employ the latch technique; see @write_seqcount_latch. * * So if a NMI hits the update of base[0] then it will use base[1] * which is still consistent. In the worst case this can result is a @@ -356,16 +356,18 @@ static void update_fast_timekeeper(const struct tk_read_base *tkr, struct tk_read_base *base = tkf->base; /* Force readers off to base[1] */ - raw_write_seqcount_latch(&tkf->seq); + write_seqcount_latch_begin(&tkf->seq); /* Update base[0] */ memcpy(base, tkr, sizeof(*base)); /* Force readers back to base[0] */ - raw_write_seqcount_latch(&tkf->seq); + write_seqcount_latch(&tkf->seq); /* Update base[1] */ memcpy(base + 1, base, sizeof(*base)); + + write_seqcount_latch_end(&tkf->seq); } static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) @@ -375,11 +377,11 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) u64 now; do { - seq = raw_read_seqcount_latch(&tkf->seq); + seq = read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base); now += timekeeping_get_ns(tkr); - } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); + } while (read_seqcount_latch_retry(&tkf->seq, seq)); return now; } diff 
--git a/kernel/time/timer.c b/kernel/time/timer.c index a283e524835d..a5860bf6d16f 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -2499,7 +2499,7 @@ static void run_local_timers(void) */ if (time_after_eq(jiffies, READ_ONCE(base->next_expiry)) || (i == BASE_DEF && tmigr_requires_handle_remote())) { - raise_softirq(TIMER_SOFTIRQ); + raise_timer_softirq(TIMER_SOFTIRQ); return; } } diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 98488b20b594..05d383143165 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -119,7 +119,7 @@ void update_vsyscall(struct timekeeper *tk) if (clock_mode != VDSO_CLOCKMODE_NONE) update_vdso_data(vdata, tk); - __arch_update_vsyscall(vdata, tk); + __arch_update_vsyscall(vdata); vdso_write_end(vdata); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a582cd25ca87..f86c78961708 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1202,7 +1202,7 @@ static const struct bpf_func_proto bpf_get_func_arg_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg3_size = sizeof(u64), }; @@ -1219,7 +1219,7 @@ static const struct bpf_func_proto bpf_get_func_ret_proto = { .func = get_func_ret, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg2_size = sizeof(u64), }; @@ -2216,8 +2216,6 @@ void perf_event_detach_bpf_prog(struct perf_event *event) old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); ret = bpf_prog_array_copy(old_array, event->prog, NULL, 0, &new_array); - if (ret == -ENOENT) - goto unlock; if (ret < 0) { bpf_prog_array_delete_safe(old_array, event->prog); } else { @@ -3133,7 +3131,8 @@ static int 
bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, struct bpf_uprobe_multi_link *umulti_link; u32 ucount = info->uprobe_multi.count; int err = 0, i; - long left; + char *p, *buf; + long left = 0; if (!upath ^ !upath_size) return -EINVAL; @@ -3147,26 +3146,23 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, info->uprobe_multi.pid = umulti_link->task ? task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0; - if (upath) { - char *p, *buf; - - upath_size = min_t(u32, upath_size, PATH_MAX); - - buf = kmalloc(upath_size, GFP_KERNEL); - if (!buf) - return -ENOMEM; - p = d_path(&umulti_link->path, buf, upath_size); - if (IS_ERR(p)) { - kfree(buf); - return PTR_ERR(p); - } - upath_size = buf + upath_size - p; - left = copy_to_user(upath, p, upath_size); + upath_size = upath_size ? min_t(u32, upath_size, PATH_MAX) : PATH_MAX; + buf = kmalloc(upath_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + p = d_path(&umulti_link->path, buf, upath_size); + if (IS_ERR(p)) { kfree(buf); - if (left) - return -EFAULT; - info->uprobe_multi.path_size = upath_size; + return PTR_ERR(p); } + upath_size = buf + upath_size - p; + + if (upath) + left = copy_to_user(upath, p, upath_size); + kfree(buf); + if (left) + return -EFAULT; + info->uprobe_multi.path_size = upath_size; if (!uoffsets && !ucookies && !uref_ctr_offsets) return 0; @@ -3244,7 +3240,8 @@ uprobe_multi_link_filter(struct uprobe_consumer *con, struct mm_struct *mm) } static int -uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs) +uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs, + __u64 *data) { struct bpf_uprobe *uprobe; @@ -3253,7 +3250,8 @@ uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs) } static int -uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs) +uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct 
pt_regs *regs, + __u64 *data) { struct bpf_uprobe *uprobe; diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index d7d4fb403f6f..69e226a48daa 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -1160,19 +1160,14 @@ void fgraph_update_pid_func(void) static int start_graph_tracing(void) { unsigned long **ret_stack_list; - int ret, cpu; + int ret; - ret_stack_list = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL); + ret_stack_list = kcalloc(FTRACE_RETSTACK_ALLOC_SIZE, + sizeof(*ret_stack_list), GFP_KERNEL); if (!ret_stack_list) return -ENOMEM; - /* The cpu_boot init_task->ret_stack will never be freed */ - for_each_online_cpu(cpu) { - if (!idle_task(cpu)->ret_stack) - ftrace_graph_init_idle_task(idle_task(cpu), cpu); - } - do { ret = alloc_retstack_tasklist(ret_stack_list); } while (ret == -EAGAIN); @@ -1242,13 +1237,33 @@ static void ftrace_graph_disable_direct(bool disable_branch) fgraph_direct_gops = &fgraph_stub; } +/* The cpu_boot init_task->ret_stack will never be freed */ +static int fgraph_cpu_init(unsigned int cpu) +{ + if (!idle_task(cpu)->ret_stack) + ftrace_graph_init_idle_task(idle_task(cpu), cpu); + return 0; +} + int register_ftrace_graph(struct fgraph_ops *gops) { + static bool fgraph_initialized; int command = 0; int ret = 0; int i = -1; - mutex_lock(&ftrace_lock); + guard(mutex)(&ftrace_lock); + + if (!fgraph_initialized) { + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "fgraph:online", + fgraph_cpu_init, NULL); + if (ret < 0) { + pr_warn("fgraph: Error to init cpu hotplug support\n"); + return ret; + } + fgraph_initialized = true; + ret = 0; + } if (!fgraph_array[0]) { /* The array must always have real data on it */ @@ -1258,10 +1273,8 @@ int register_ftrace_graph(struct fgraph_ops *gops) } i = fgraph_lru_alloc_index(); - if (i < 0 || WARN_ON_ONCE(fgraph_array[i] != &fgraph_stub)) { - ret = -ENOSPC; - goto out; - } + if (i < 0 || WARN_ON_ONCE(fgraph_array[i] != &fgraph_stub)) + return -ENOSPC; gops->idx = i; ftrace_graph_active++; @@ 
-1298,8 +1311,6 @@ error: gops->saved_func = NULL; fgraph_lru_release_index(i); } -out: - mutex_unlock(&ftrace_lock); return ret; } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 77dc0b25140e..5807116bcd0b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -6725,39 +6725,38 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) } for_each_buffer_cpu(buffer, cpu) { + struct buffer_data_page *old_free_data_page; + struct list_head old_pages; + unsigned long flags; if (!cpumask_test_cpu(cpu, buffer->cpumask)) continue; cpu_buffer = buffer->buffers[cpu]; + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + /* Clear the head bit to make the link list normal to read */ rb_head_page_deactivate(cpu_buffer); - /* Now walk the list and free all the old sub buffers */ - list_for_each_entry_safe(bpage, tmp, cpu_buffer->pages, list) { - list_del_init(&bpage->list); - free_buffer_page(bpage); - } - /* The above loop stopped an the last page needing to be freed */ - bpage = list_entry(cpu_buffer->pages, struct buffer_page, list); - free_buffer_page(bpage); - - /* Free the current reader page */ - free_buffer_page(cpu_buffer->reader_page); + /* + * Collect buffers from the cpu_buffer pages list and the + * reader_page on old_pages, so they can be freed later when not + * under a spinlock. The pages list is a linked list with no + * head, adding old_pages turns it into a regular list with + * old_pages being the head. 
+ */ + list_add(&old_pages, cpu_buffer->pages); + list_add(&cpu_buffer->reader_page->list, &old_pages); /* One page was allocated for the reader page */ cpu_buffer->reader_page = list_entry(cpu_buffer->new_pages.next, struct buffer_page, list); list_del_init(&cpu_buffer->reader_page->list); - /* The cpu_buffer pages are a link list with no head */ + /* Install the new pages, remove the head from the list */ cpu_buffer->pages = cpu_buffer->new_pages.next; - cpu_buffer->new_pages.next->prev = cpu_buffer->new_pages.prev; - cpu_buffer->new_pages.prev->next = cpu_buffer->new_pages.next; - - /* Clear the new_pages list */ - INIT_LIST_HEAD(&cpu_buffer->new_pages); + list_del_init(&cpu_buffer->new_pages); cpu_buffer->head_page = list_entry(cpu_buffer->pages, struct buffer_page, list); @@ -6766,11 +6765,20 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) cpu_buffer->nr_pages = cpu_buffer->nr_pages_to_update; cpu_buffer->nr_pages_to_update = 0; - free_pages((unsigned long)cpu_buffer->free_page, old_order); + old_free_data_page = cpu_buffer->free_page; cpu_buffer->free_page = NULL; rb_head_page_activate(cpu_buffer); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + /* Free old sub buffers */ + list_for_each_entry_safe(bpage, tmp, &old_pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + free_pages((unsigned long)old_free_data_page, old_order); + rb_check_pages(cpu_buffer); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c01375adc471..6a891e00aa7f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2386,6 +2386,25 @@ void tracing_reset_online_cpus(struct array_buffer *buf) ring_buffer_record_enable(buffer); } +static void tracing_reset_all_cpus(struct array_buffer *buf) +{ + struct trace_buffer *buffer = buf->buffer; + + if (!buffer) + return; + + ring_buffer_record_disable(buffer); + + /* Make sure all commits have finished */ + synchronize_rcu(); + + buf->time_start = 
buffer_ftrace_now(buf, buf->cpu); + + ring_buffer_reset(buffer); + + ring_buffer_record_enable(buffer); +} + /* Must have trace_types_lock held */ void tracing_reset_all_online_cpus_unlocked(void) { @@ -3697,8 +3716,8 @@ static void test_can_verify(void) void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, va_list ap) { - long text_delta = iter->tr->text_delta; - long data_delta = iter->tr->data_delta; + long text_delta = 0; + long data_delta = 0; const char *p = fmt; const char *str; bool good; @@ -3710,6 +3729,17 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, if (static_branch_unlikely(&trace_no_verify)) goto print; + /* + * When the kernel is booted with the tp_printk command line + * parameter, trace events go directly through to printk(). + * It also is checked by this function, but it does not + * have an associated trace_array (tr) for it. + */ + if (iter->tr) { + text_delta = iter->tr->text_delta; + data_delta = iter->tr->data_delta; + } + /* Don't bother checking when doing a ftrace_dump() */ if (iter->fmt == static_fmt_buf) goto print; @@ -5490,6 +5520,10 @@ static const struct file_operations tracing_iter_fops = { static const char readme_msg[] = "tracing mini-HOWTO:\n\n" + "By default tracefs removes all OTH file permission bits.\n" + "When mounting tracefs an optional group id can be specified\n" + "which adds the group to every directory and file in tracefs:\n\n" + "\t e.g. 
mount -t tracefs [-o [gid=<gid>]] nodev /sys/kernel/tracing\n\n" "# echo 0 > tracing_on : quick way to disable tracing\n" "# echo 1 > tracing_on : quick way to re-enable tracing\n\n" " Important files:\n" @@ -6130,8 +6164,13 @@ static void update_last_data(struct trace_array *tr) if (!tr->text_delta && !tr->data_delta) return; - /* Clear old data */ - tracing_reset_online_cpus(&tr->array_buffer); + /* + * Need to clear all CPU buffers as there cannot be events + * from the previous boot mixed with events with this boot + * as that will cause a confusing trace. Need to clear all + * CPU buffers, even for those that may currently be offline. + */ + tracing_reset_all_cpus(&tr->array_buffer); /* Using current data now */ tr->text_delta = 0; @@ -10610,10 +10649,10 @@ __init static void enable_instances(void) * cannot be deleted by user space, so keep the reference * to it. */ - if (start) + if (start) { tr->flags |= TRACE_ARRAY_FL_BOOT; - else - trace_array_put(tr); + tr->ref++; + } while ((tok = strsep(&curr_str, ","))) { early_enable_events(tr, tok, true); diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index b0e0ec85912e..ebda68ee9abf 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -912,6 +912,11 @@ static int __trace_eprobe_create(int argc, const char *argv[]) } } + if (argc - 2 > MAX_TRACE_ARGS) { + ret = -E2BIG; + goto error; + } + mutex_lock(&event_mutex); event_call = find_and_get_event(sys_name, sys_event); ep = alloc_event_probe(group, event, event_call, argc - 2); @@ -937,7 +942,7 @@ static int __trace_eprobe_create(int argc, const char *argv[]) argc -= 2; argv += 2; /* parse arguments */ - for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + for (i = 0; i < argc; i++) { trace_probe_log_set_index(i + 2); ret = trace_eprobe_tp_update_arg(ep, argv, i); if (ret) diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index a079abd8955b..c62d1629cffe 100644 --- a/kernel/trace/trace_fprobe.c +++ 
b/kernel/trace/trace_fprobe.c @@ -1187,6 +1187,10 @@ static int __trace_fprobe_create(int argc, const char *argv[]) argc = new_argc; argv = new_argv; } + if (argc > MAX_TRACE_ARGS) { + ret = -E2BIG; + goto out; + } ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); if (ret) @@ -1203,7 +1207,7 @@ static int __trace_fprobe_create(int argc, const char *argv[]) } /* parse arguments */ - for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + for (i = 0; i < argc; i++) { trace_probe_log_set_index(i + 2); ctx.offset = 0; ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], &ctx); diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index b791524a6536..3bd6071441ad 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -520,6 +520,8 @@ static void hwlat_hotplug_workfn(struct work_struct *dummy) if (!hwlat_busy || hwlat_data.thread_mode != MODE_PER_CPU) goto out_unlock; + if (!cpu_online(cpu)) + goto out_unlock; if (!cpumask_test_cpu(cpu, tr->tracing_cpumask)) goto out_unlock; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 61a6da808203..263fac44d3ca 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1013,6 +1013,10 @@ static int __trace_kprobe_create(int argc, const char *argv[]) argc = new_argc; argv = new_argv; } + if (argc > MAX_TRACE_ARGS) { + ret = -E2BIG; + goto out; + } ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); if (ret) @@ -1029,7 +1033,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) } /* parse arguments */ - for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + for (i = 0; i < argc; i++) { trace_probe_log_set_index(i + 2); ctx.offset = 0; ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], &ctx); diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 1439064f65d6..a50ed23bee77 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1953,12 +1953,8 @@ static void stop_kthread(unsigned 
int cpu) { struct task_struct *kthread; - mutex_lock(&interface_lock); - kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; + kthread = xchg_relaxed(&(per_cpu(per_cpu_osnoise_var, cpu).kthread), NULL); if (kthread) { - per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; - mutex_unlock(&interface_lock); - if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask) && !WARN_ON(!test_bit(OSN_WORKLOAD, &osnoise_options))) { kthread_stop(kthread); @@ -1972,7 +1968,6 @@ static void stop_kthread(unsigned int cpu) put_task_struct(kthread); } } else { - mutex_unlock(&interface_lock); /* if no workload, just return */ if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { /* @@ -1994,8 +1989,12 @@ static void stop_per_cpu_kthreads(void) { int cpu; - for_each_possible_cpu(cpu) + cpus_read_lock(); + + for_each_online_cpu(cpu) stop_kthread(cpu); + + cpus_read_unlock(); } /* @@ -2007,6 +2006,10 @@ static int start_kthread(unsigned int cpu) void *main = osnoise_main; char comm[24]; + /* Do not start a new thread if it is already running */ + if (per_cpu(per_cpu_osnoise_var, cpu).kthread) + return 0; + if (timerlat_enabled()) { snprintf(comm, 24, "timerlat/%d", cpu); main = timerlat_main; @@ -2061,11 +2064,10 @@ static int start_per_cpu_kthreads(void) if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask)) { struct task_struct *kthread; - kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; + kthread = xchg_relaxed(&(per_cpu(per_cpu_osnoise_var, cpu).kthread), NULL); if (!WARN_ON(!kthread)) kthread_stop(kthread); } - per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; } for_each_cpu(cpu, current_mask) { @@ -2095,6 +2097,8 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy) mutex_lock(&interface_lock); cpus_read_lock(); + if (!cpu_online(cpu)) + goto out_unlock; if (!cpumask_test_cpu(cpu, &osnoise_cpumask)) goto out_unlock; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 39877c80d6cb..16a5e368e7b7 100644 --- a/kernel/trace/trace_probe.c +++ 
b/kernel/trace/trace_probe.c @@ -276,7 +276,7 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, } trace_probe_log_err(offset, NO_EVENT_NAME); return -EINVAL; - } else if (len > MAX_EVENT_NAME_LEN) { + } else if (len >= MAX_EVENT_NAME_LEN) { trace_probe_log_err(offset, EVENT_TOO_LONG); return -EINVAL; } diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index c4ad7cd7e778..1469dd8075fa 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1485,7 +1485,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) /* reset the max latency */ tr->max_latency = 0; - while (p->on_rq) { + while (task_is_runnable(p)) { /* * Sleep to make sure the -deadline thread is asleep too. * On virtual machines we can't rely on timings, diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c40531d2cbad..fed382b7881b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -89,9 +89,11 @@ static struct trace_uprobe *to_trace_uprobe(struct dyn_event *ev) static int register_uprobe_event(struct trace_uprobe *tu); static int unregister_uprobe_event(struct trace_uprobe *tu); -static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); +static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs, + __u64 *data); static int uretprobe_dispatcher(struct uprobe_consumer *con, - unsigned long func, struct pt_regs *regs); + unsigned long func, struct pt_regs *regs, + __u64 *data); #ifdef CONFIG_STACK_GROWSUP static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n) @@ -565,6 +567,8 @@ static int __trace_uprobe_create(int argc, const char **argv) if (argc < 2) return -ECANCELED; + if (argc - 2 > MAX_TRACE_ARGS) + return -E2BIG; if (argv[0][1] == ':') event = &argv[0][2]; @@ -690,7 +694,7 @@ static int __trace_uprobe_create(int argc, const char **argv) tu->filename = filename; /* parse arguments */ - 
for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + for (i = 0; i < argc; i++) { struct traceprobe_parse_context ctx = { .flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER, }; @@ -875,6 +879,7 @@ struct uprobe_cpu_buffer { }; static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer; static int uprobe_buffer_refcnt; +#define MAX_UCB_BUFFER_SIZE PAGE_SIZE static int uprobe_buffer_init(void) { @@ -979,6 +984,11 @@ static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu, ucb = uprobe_buffer_get(); ucb->dsize = tu->tp.size + dsize; + if (WARN_ON_ONCE(ucb->dsize > MAX_UCB_BUFFER_SIZE)) { + ucb->dsize = MAX_UCB_BUFFER_SIZE; + dsize = MAX_UCB_BUFFER_SIZE - tu->tp.size; + } + store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize); *ucbp = ucb; @@ -998,9 +1008,6 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, WARN_ON(call != trace_file->event_call); - if (WARN_ON_ONCE(ucb->dsize > PAGE_SIZE)) - return; - if (trace_trigger_soft_disabled(trace_file)) return; @@ -1517,7 +1524,8 @@ trace_uprobe_register(struct trace_event_call *event, enum trace_reg type, } } -static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) +static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs, + __u64 *data) { struct trace_uprobe *tu; struct uprobe_dispatch_data udd; @@ -1548,7 +1556,8 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) } static int uretprobe_dispatcher(struct uprobe_consumer *con, - unsigned long func, struct pt_regs *regs) + unsigned long func, struct pt_regs *regs, + __u64 *data) { struct trace_uprobe *tu; struct uprobe_dispatch_data udd; diff --git a/kernel/ucount.c b/kernel/ucount.c index 8c07714ff27d..696406939be5 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -307,7 +307,8 @@ void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type) do_dec_rlimit_put_ucounts(ucounts, NULL, type); } -long inc_rlimit_get_ucounts(struct 
ucounts *ucounts, enum rlimit_type type) +long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type, + bool override_rlimit) { /* Caller must hold a reference to ucounts */ struct ucounts *iter; @@ -317,10 +318,11 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type) for (iter = ucounts; iter; iter = iter->ns->ucounts) { long new = atomic_long_add_return(1, &iter->rlimit[type]); if (new < 0 || new > max) - goto unwind; + goto dec_unwind; if (iter == ucounts) ret = new; - max = get_userns_rlimit_max(iter->ns, type); + if (!override_rlimit) + max = get_userns_rlimit_max(iter->ns, type); /* * Grab an extra ucount reference for the caller when * the rlimit count was previously 0. @@ -334,7 +336,6 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type) dec_unwind: dec = atomic_long_sub_return(1, &iter->rlimit[type]); WARN_ON_ONCE(dec < 0); -unwind: do_dec_rlimit_put_ucounts(ucounts, iter, type); return 0; } diff --git a/kernel/umh.c b/kernel/umh.c index ff1f13a27d29..be9234270777 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -13,7 +13,6 @@ #include <linux/completion.h> #include <linux/cred.h> #include <linux/file.h> -#include <linux/fdtable.h> #include <linux/fs_struct.h> #include <linux/workqueue.h> #include <linux/security.h> diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index d36242fd4936..1895fbc32bcb 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -663,16 +663,14 @@ struct watch_queue *get_watch_queue(int fd) { struct pipe_inode_info *pipe; struct watch_queue *wqueue = ERR_PTR(-EINVAL); - struct fd f; + CLASS(fd, f)(fd); - f = fdget(fd); - if (fd_file(f)) { + if (!fd_empty(f)) { pipe = get_pipe_info(fd_file(f), false); if (pipe && pipe->watch_queue) { wqueue = pipe->watch_queue; kref_get(&wqueue->usage); } - fdput(f); } return wqueue; |
