From 42268ad0eb4142245ea40ab01a5690a40e9c3b41 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Sep 2024 11:10:07 -1000 Subject: sched_ext: Build fix for !CONFIG_SMP move_remote_task_to_local_dsq() is only defined on SMP configs but scx_dispatch_from_dsq() was calling move_remote_task_to_local_dsq() on UP configs too causing build failures. Add a dummy move_remote_task_to_local_dsq() which triggers a warning. Signed-off-by: Tejun Heo Fixes: 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202409241108.jaocHiDJ-lkp@intel.com/ --- kernel/sched/ext.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/sched/ext.c') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index c09e3dc38c34..d74d1fe06999 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2357,6 +2357,7 @@ static bool consume_remote_task(struct rq *this_rq, struct task_struct *p, } } #else /* CONFIG_SMP */ +static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); } static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { return false; } static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; } #endif /* CONFIG_SMP */ -- cgit v1.2.3 From 63fb3ec80516b256e9fc91de48567f5eda61d135 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 26 Sep 2024 12:56:46 -1000 Subject: sched_ext: Allow only user DSQs for scx_bpf_consume(), scx_bpf_dsq_nr_queued() and bpf_iter_scx_dsq_new() SCX_DSQ_GLOBAL is special in that it can't be used as a priority queue and is consumed implicitly, but all BPF DSQ related kfuncs could be used on it. SCX_DSQ_GLOBAL will be split per-node for scalability and those operations won't make sense anymore. Disallow SCX_DSQ_GLOBAL on scx_bpf_consume(), scx_bpf_dsq_nr_queued() and bpf_iter_scx_dsq_new(). This means that SCX_DSQ_GLOBAL can only be used as a dispatch target from BPF schedulers. With scx_flatcg, which was using SCX_DSQ_GLOBAL as the fallback DSQ, updated, this shouldn't affect any schedulers. This leaves find_dsq_for_dispatch() the only user of find_non_local_dsq(). Open code and remove find_non_local_dsq().
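For BPF scheduler authors, the practical consequence is that a scheduler which wants a fallback queue it can consume and inspect explicitly should create its own DSQ rather than rely on SCX_DSQ_GLOBAL. The following is a minimal illustrative sketch, not taken from this series; it assumes the helpers from tools/sched_ext's common.bpf.h, and the op names and DSQ id are made up:

    #include <scx/common.bpf.h>

    #define FALLBACK_DSQ_ID 0   /* any id without the builtin bit set */

    s32 BPF_STRUCT_OPS_SLEEPABLE(sketch_init)
    {
            /* user DSQs must be created before use; -1 means no NUMA preference */
            return scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1);
    }

    void BPF_STRUCT_OPS(sketch_enqueue, struct task_struct *p, u64 enq_flags)
    {
            /* queue on the scheduler-private DSQ instead of SCX_DSQ_GLOBAL */
            scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_DFL, enq_flags);
    }

    void BPF_STRUCT_OPS(sketch_dispatch, s32 cpu, struct task_struct *prev)
    {
            /* user DSQs remain valid for scx_bpf_consume() and the DSQ iterator */
            scx_bpf_consume(FALLBACK_DSQ_ID);
    }

SCX_DSQ_GLOBAL itself remains a valid dispatch target; it just can no longer be consumed, sized or iterated explicitly from BPF.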
Signed-off-by: tejun heo Acked-by: David Vernet --- kernel/sched/ext.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'kernel/sched/ext.c') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index d74d1fe06999..ed2b914c42d1 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1808,16 +1808,6 @@ static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); } -static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id) -{ - lockdep_assert(rcu_read_lock_any_held()); - - if (dsq_id == SCX_DSQ_GLOBAL) - return &scx_dsq_global; - else - return find_user_dsq(dsq_id); -} - static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, struct task_struct *p) { @@ -1835,7 +1825,11 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, return &cpu_rq(cpu)->scx.local_dsq; } - dsq = find_non_local_dsq(dsq_id); + if (dsq_id == SCX_DSQ_GLOBAL) + dsq = &scx_dsq_global; + else + dsq = find_user_dsq(dsq_id); + if (unlikely(!dsq)) { scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", dsq_id, p->comm, p->pid); @@ -6176,7 +6170,7 @@ __bpf_kfunc bool scx_bpf_consume(u64 dsq_id) flush_dispatch_buf(dspc->rq); - dsq = find_non_local_dsq(dsq_id); + dsq = find_user_dsq(dsq_id); if (unlikely(!dsq)) { scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id); return false; @@ -6497,7 +6491,7 @@ __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) goto out; } } else { - dsq = find_non_local_dsq(dsq_id); + dsq = find_user_dsq(dsq_id); if (dsq) { ret = READ_ONCE(dsq->nr); goto out; @@ -6546,7 +6540,7 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, if (flags & ~__SCX_DSQ_ITER_USER_FLAGS) return -EINVAL; - kit->dsq = find_non_local_dsq(dsq_id); + kit->dsq = find_user_dsq(dsq_id); if (!kit->dsq) return -ENOENT; -- cgit v1.2.3 From bba26bf356d1c1314a7bb24041c64c5784febbb0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 26 Sep 2024 12:56:46 -1000 Subject: sched_ext: Relocate find_user_dsq() To prepare for the addition of find_global_dsq(). No functional changes. Signed-off-by: tejun heo Acked-by: David Vernet --- kernel/sched/ext.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel/sched/ext.c') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index ed2b914c42d1..acb4db7827a4 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1029,6 +1029,11 @@ static bool u32_before(u32 a, u32 b) return (s32)(a - b) < 0; } +static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) +{ + return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); +} + /* * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate @@ -1803,11 +1808,6 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p) raw_spin_unlock(&dsq->lock); } -static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) -{ - return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); -} - static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, struct task_struct *p) { -- cgit v1.2.3 From b7b3b2dbae73b412c2d24b3d0ebf1110991e4510 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 26 Sep 2024 12:56:46 -1000 Subject: sched_ext: Split the global DSQ per NUMA node In the bypass mode, the global DSQ is used to schedule all tasks in simple FIFO order. 
All tasks are queued into the global DSQ and all CPUs try to execute tasks from it. This creates a lot of cross-node cacheline accesses and scheduling across the node boundaries, and can lead to live-lock conditions where the system takes tens of minutes to disable the BPF scheduler while executing in the bypass mode. Split the global DSQ per NUMA node. Each node has its own global DSQ. When a task is dispatched to SCX_DSQ_GLOBAL, it's put into the global DSQ local to the task's CPU and all CPUs in a node only consume its node-local global DSQ. This resolves a livelock condition which could be reliably triggered on an 2x EPYC 7642 system by running `stress-ng --race-sched 1024` together with `stress-ng --workload 80 --workload-threads 10` while repeatedly enabling and disabling a SCX scheduler. Signed-off-by: Tejun Heo Acked-by: David Vernet --- kernel/sched/ext.c | 73 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 60 insertions(+), 13 deletions(-) (limited to 'kernel/sched/ext.c') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index acb4db7827a4..949a3c43000a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -925,8 +925,15 @@ static unsigned long __percpu *scx_kick_cpus_pnt_seqs; */ static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); -/* dispatch queues */ -static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global; +/* + * Dispatch queues. + * + * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. This is + * to avoid live-locking in bypass mode where all tasks are dispatched to + * %SCX_DSQ_GLOBAL and all CPUs consume from it. If per-node split isn't + * sufficient, it can be further split. + */ +static struct scx_dispatch_q **global_dsqs; static const struct rhashtable_params dsq_hash_params = { .key_len = 8, @@ -1029,6 +1036,11 @@ static bool u32_before(u32 a, u32 b) return (s32)(a - b) < 0; } +static struct scx_dispatch_q *find_global_dsq(struct task_struct *p) +{ + return global_dsqs[cpu_to_node(task_cpu(p))]; +} + static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) { return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); @@ -1642,7 +1654,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, scx_ops_error("attempting to dispatch to a destroyed dsq"); /* fall back to the global dsq */ raw_spin_unlock(&dsq->lock); - dsq = &scx_dsq_global; + dsq = find_global_dsq(p); raw_spin_lock(&dsq->lock); } } @@ -1820,20 +1832,20 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) - return &scx_dsq_global; + return find_global_dsq(p); return &cpu_rq(cpu)->scx.local_dsq; } if (dsq_id == SCX_DSQ_GLOBAL) - dsq = &scx_dsq_global; + dsq = find_global_dsq(p); else dsq = find_user_dsq(dsq_id); if (unlikely(!dsq)) { scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", dsq_id, p->comm, p->pid); - return &scx_dsq_global; + return find_global_dsq(p); } return dsq; @@ -2005,7 +2017,7 @@ local_norefill: global: touch_core_sched(rq, p); /* see the comment in local: */ p->scx.slice = SCX_SLICE_DFL; - dispatch_enqueue(&scx_dsq_global, p, enq_flags); + dispatch_enqueue(find_global_dsq(p), p, enq_flags); } static bool task_runnable(const struct task_struct *p) @@ -2391,6 +2403,13 @@ retry: return false; } +static bool consume_global_dsq(struct rq *rq) +{ + int node = cpu_to_node(cpu_of(rq)); + + return consume_dispatch_q(rq, global_dsqs[node]); +} + 
/** * dispatch_to_local_dsq - Dispatch a task to a local dsq * @rq: current rq which is locked @@ -2424,7 +2443,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, #ifdef CONFIG_SMP if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { - dispatch_enqueue(&scx_dsq_global, p, enq_flags | SCX_ENQ_CLEAR_OPSS); + dispatch_enqueue(find_global_dsq(p), p, + enq_flags | SCX_ENQ_CLEAR_OPSS); return; } @@ -2624,7 +2644,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) if (rq->scx.local_dsq.nr) goto has_tasks; - if (consume_dispatch_q(rq, &scx_dsq_global)) + if (consume_global_dsq(rq)) goto has_tasks; if (!SCX_HAS_OP(dispatch) || scx_rq_bypassing(rq) || !scx_rq_online(rq)) @@ -2649,7 +2669,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) if (rq->scx.local_dsq.nr) goto has_tasks; - if (consume_dispatch_q(rq, &scx_dsq_global)) + if (consume_global_dsq(rq)) goto has_tasks; /* @@ -4924,7 +4944,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) struct scx_task_iter sti; struct task_struct *p; unsigned long timeout; - int i, cpu, ret; + int i, cpu, node, ret; if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), cpu_possible_mask)) { @@ -4943,6 +4963,34 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) } } + if (!global_dsqs) { + struct scx_dispatch_q **dsqs; + + dsqs = kcalloc(nr_node_ids, sizeof(dsqs[0]), GFP_KERNEL); + if (!dsqs) { + ret = -ENOMEM; + goto err_unlock; + } + + for_each_node_state(node, N_POSSIBLE) { + struct scx_dispatch_q *dsq; + + dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node); + if (!dsq) { + for_each_node_state(node, N_POSSIBLE) + kfree(dsqs[node]); + kfree(dsqs); + ret = -ENOMEM; + goto err_unlock; + } + + init_dsq(dsq, SCX_DSQ_GLOBAL); + dsqs[node] = dsq; + } + + global_dsqs = dsqs; + } + if (scx_ops_enable_state() != SCX_OPS_DISABLED) { ret = -EBUSY; goto err_unlock; @@ -5777,7 +5825,6 @@ void __init init_sched_ext_class(void) SCX_TG_ONLINE); BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); - init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL); #ifdef CONFIG_SMP BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); @@ -6053,7 +6100,7 @@ static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit, if (dst_dsq->id == SCX_DSQ_LOCAL) { dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); if (!task_can_run_on_remote_rq(p, dst_rq, true)) { - dst_dsq = &scx_dsq_global; + dst_dsq = find_global_dsq(p); dst_rq = src_rq; } } else { -- cgit v1.2.3 From 6f34d8d382d64e7d8e77f5a9ddfd06f4c04937b0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 26 Sep 2024 12:56:46 -1000 Subject: sched_ext: Use shorter slice while bypassing While bypassing, tasks are scheduled in FIFO order which favors tasks that hog CPUs. This can slow down e.g. unloading of the BPF scheduler. While bypassing, guaranteeing timely forward progress is the main goal. There's no point in giving long slices. Shorten the time slice used while bypassing from 20ms to 5ms. 
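For reference, the 5ms figure falls out of the constant added in the diff below: SCX_SLICE_BYPASS is defined as SCX_SLICE_DFL / 4, and assuming SCX_SLICE_DFL is the default 20ms slice (20 * NSEC_PER_MSEC), the arithmetic is simply:

    SCX_SLICE_DFL    = 20 * NSEC_PER_MSEC   /* 20,000,000 ns = 20 ms */
    SCX_SLICE_BYPASS = SCX_SLICE_DFL / 4    /*  5,000,000 ns =  5 ms */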
Signed-off-by: Tejun Heo Acked-by: David Vernet --- kernel/sched/ext.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/sched/ext.c') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 949a3c43000a..d6f6bf6caecc 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -9,6 +9,7 @@ #define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) enum scx_consts { + SCX_SLICE_BYPASS = SCX_SLICE_DFL / 4, SCX_DSP_DFL_MAX_BATCH = 32, SCX_DSP_MAX_LOOPS = 32, SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, @@ -1944,6 +1945,7 @@ static bool scx_rq_online(struct rq *rq) static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, int sticky_cpu) { + bool bypassing = scx_rq_bypassing(rq); struct task_struct **ddsp_taskp; unsigned long qseq; @@ -1961,7 +1963,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, if (!scx_rq_online(rq)) goto local; - if (scx_rq_bypassing(rq)) + if (bypassing) goto global; if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) @@ -2016,7 +2018,7 @@ local_norefill: global: touch_core_sched(rq, p); /* see the comment in local: */ - p->scx.slice = SCX_SLICE_DFL; + p->scx.slice = bypassing ? SCX_SLICE_BYPASS : SCX_SLICE_DFL; dispatch_enqueue(find_global_dsq(p), p, enq_flags); } -- cgit v1.2.3 From 1bbcfe620e03aafc542b1c649dea0a9499a4490f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 27 Sep 2024 10:02:39 -1000 Subject: sched_ext: Relocate check_hotplug_seq() call in scx_ops_enable() check_hotplug_seq() is used to detect CPU hotplug event which occurred while the BPF scheduler is being loaded so that initialization can be retried if CPU hotplug events take place before the CPU hotplug callbacks are online. As such, the best place to call it is in the same cpu_read_lock() section that enables the CPU hotplug ops. Currently, it is called in the next cpus_read_lock() block in scx_ops_enable(). The side effect of this placement is a small window in which hotplug sequence detection can trigger unnecessarily, which isn't critical. Move check_hotplug_seq() invocation to the same cpus_read_lock() block as the hotplug operation enablement to close the window and get the invocation out of the way for planned locking updates. Signed-off-by: Tejun Heo Cc: David Vernet --- kernel/sched/ext.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/sched/ext.c') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index d6f6bf6caecc..e8ab7e5ee2dd 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5050,6 +5050,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (((void (**)(void))ops)[i]) static_branch_enable_cpuslocked(&scx_has_op[i]); + check_hotplug_seq(ops); cpus_read_unlock(); ret = validate_ops(ops); @@ -5098,8 +5099,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) cpus_read_lock(); scx_cgroup_lock(); - check_hotplug_seq(ops); - for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) if (((void (**)(void))ops)[i]) static_branch_enable_cpuslocked(&scx_has_op[i]); -- cgit v1.2.3 From fc1fcebead344360979ea9407029f9c8a99d718f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 27 Sep 2024 10:02:39 -1000 Subject: sched_ext: Remove SCX_OPS_PREPPING The distinction between SCX_OPS_PREPPING and SCX_OPS_ENABLING is not used anywhere and only adds confusion. Drop SCX_OPS_PREPPING. 
Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) (limited to 'kernel/sched/ext.c') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index e8ab7e5ee2dd..daeb8446ff32 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -779,7 +779,6 @@ enum scx_tg_flags { }; enum scx_ops_enable_state { - SCX_OPS_PREPPING, SCX_OPS_ENABLING, SCX_OPS_ENABLED, SCX_OPS_DISABLING, @@ -787,7 +786,6 @@ enum scx_ops_enable_state { }; static const char *scx_ops_enable_state_str[] = { - [SCX_OPS_PREPPING] = "prepping", [SCX_OPS_ENABLING] = "enabling", [SCX_OPS_ENABLED] = "enabled", [SCX_OPS_DISABLING] = "disabling", @@ -5016,12 +5014,12 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) } /* - * Set scx_ops, transition to PREPPING and clear exit info to arm the + * Set scx_ops, transition to ENABLING and clear exit info to arm the * disable path. Failure triggers full disabling from here on. */ scx_ops = *ops; - WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_PREPPING) != + WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_ENABLING) != SCX_OPS_DISABLED); atomic_set(&scx_exit_kind, SCX_EXIT_NONE); @@ -5174,23 +5172,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) */ preempt_disable(); - /* - * From here on, the disable path must assume that tasks have ops - * enabled and need to be recovered. - * - * Transition to ENABLING fails iff the BPF scheduler has already - * triggered scx_bpf_error(). Returning an error code here would lose - * the recorded error information. Exit indicating success so that the - * error is notified through ops.exit() with all the details. - */ - if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLING, SCX_OPS_PREPPING)) { - preempt_enable(); - spin_unlock_irq(&scx_tasks_lock); - WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); - ret = 0; - goto err_disable_unlock_all; - } - /* * We're fully committed and can't fail. The PREPPED -> ENABLED * transitions here are synchronized against sched_ext_free() through @@ -5221,7 +5202,11 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) cpus_read_unlock(); percpu_up_write(&scx_fork_rwsem); - /* see above ENABLING transition for the explanation on exiting with 0 */ + /* + * Returning an error code here would lose the recorded error + * information. Exit indicating success so that the error is notified + * through ops.exit() with all the details. + */ if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); ret = 0; -- cgit v1.2.3 From 8c2090c504e998c8f34ec870bae71dafcc96a6e0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 27 Sep 2024 10:02:40 -1000 Subject: sched_ext: Initialize in bypass mode scx_ops_enable() used preempt_disable() around the task iteration loop to switch tasks into SCX to guarantee forward progress of the task which is running scx_ops_enable(). However, in the gap between setting __scx_ops_enabled and preeempt_disable(), an external entity can put tasks including the enabling one into SCX prematurely, which can lead to malfunctions including stalls. The bypass mode can wrap the entire enabling operation and guarantee forward progress no matter what the BPF scheduler does. Use the bypass mode instead to guarantee forward progress while enabling. 
While at it, release and regrab scx_tasks_lock between the two task iteration loops in scx_ops_enable() for clarity as there is no reason to keep holding the lock between them. Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'kernel/sched/ext.c') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index daeb8446ff32..00883f3ef647 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5075,6 +5075,14 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) queue_delayed_work(system_unbound_wq, &scx_watchdog_work, scx_watchdog_timeout / 2); + /* + * Once __scx_ops_enabled is set, %current can be switched to SCX + * anytime. This can lead to stalls as some BPF schedulers (e.g. + * userspace scheduling) may not function correctly before all tasks are + * switched. Init in bypass mode to guarantee forward progress. + */ + scx_ops_bypass(true); + /* * Lock out forks, cgroup on/offlining and moves before opening the * floodgate so that they don't wander into the operations prematurely. @@ -5134,7 +5142,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) * disabled. */ spin_lock_irq(&scx_tasks_lock); - scx_task_iter_init(&sti); while ((p = scx_task_iter_next_locked(&sti))) { /* @@ -5163,22 +5170,19 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) spin_lock_irq(&scx_tasks_lock); } scx_task_iter_exit(&sti); + spin_unlock_irq(&scx_tasks_lock); /* - * All tasks are prepped but are still ops-disabled. Ensure that - * %current can't be scheduled out and switch everyone. - * preempt_disable() is necessary because we can't guarantee that - * %current won't be starved if scheduled out while switching. + * All tasks are prepped but the tasks are not enabled. Switch everyone. */ - preempt_disable(); + WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); /* * We're fully committed and can't fail. The PREPPED -> ENABLED * transitions here are synchronized against sched_ext_free() through * scx_tasks_lock. */ - WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); - + spin_lock_irq(&scx_tasks_lock); scx_task_iter_init(&sti); while ((p = scx_task_iter_next_locked(&sti))) { const struct sched_class *old_class = p->sched_class; @@ -5195,12 +5199,12 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) check_class_changed(task_rq(p), p, old_class, p->prio); } scx_task_iter_exit(&sti); - spin_unlock_irq(&scx_tasks_lock); - preempt_enable(); + spin_unlock_irq(&scx_tasks_lock); + scx_cgroup_unlock(); cpus_read_unlock(); percpu_up_write(&scx_fork_rwsem); + scx_ops_bypass(false); /* @@ -5241,6 +5245,7 @@ err_unlock: err_disable_unlock_all: scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); + scx_ops_bypass(false); err_disable_unlock_cpus: cpus_read_unlock(); err_disable: -- cgit v1.2.3 From 9753358a6a2b011478e8efdabbb489216252426f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 27 Sep 2024 10:02:40 -1000 Subject: sched_ext: Fix SCX_TASK_INIT -> SCX_TASK_READY transitions in scx_ops_enable() scx_ops_enable() has two task iteration loops. The first one calls scx_ops_init_task() on every task and the latter switches the eligible ones into SCX. The first loop left the tasks in SCX_TASK_INIT state and then the second loop switched them into READY before switching the tasks into SCX.
The distinction between INIT and READY is only meaningful in the fork path where it's used to tell whether the task finished forking so that we can tell ops.exit_task() accordingly. Leaving tasks in INIT state between the two loops is inconsistent with the fork path and incorrect. The following can be triggered by running a program which keeps toggling a task between SCHED_OTHER and SCHED_EXT while a BPF scheduler is being enabled: sched_ext: Invalid task state transition 1 -> 3 for fish[1526] WARNING: CPU: 2 PID: 1615 at kernel/sched/ext.c:3393 scx_ops_enable_task+0x1a1/0x200 ... Sched_ext: qmap (enabling+all) RIP: 0010:scx_ops_enable_task+0x1a1/0x200 ... switching_to_scx+0x13/0xa0 __sched_setscheduler+0x850/0xa50 do_sched_setscheduler+0x104/0x1c0 __x64_sys_sched_setscheduler+0x18/0x30 do_syscall_64+0x7b/0x140 entry_SYSCALL_64_after_hwframe+0x76/0x7e Fix it by transitioning to READY in the first loop right after scx_ops_init_task() succeeds. Signed-off-by: Tejun Heo Cc: David Vernet --- kernel/sched/ext.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/sched/ext.c') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 00883f3ef647..e83af19de59d 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5166,6 +5166,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) goto err_disable_unlock_all; } + scx_set_task_state(p, SCX_TASK_READY); + put_task_struct(p); spin_lock_irq(&scx_tasks_lock); } @@ -5178,7 +5180,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); /* - * We're fully committed and can't fail. The PREPPED -> ENABLED + * We're fully committed and can't fail. The task READY -> ENABLED * transitions here are synchronized against sched_ext_free() through * scx_tasks_lock. */ @@ -5190,7 +5192,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - scx_set_task_state(p, SCX_TASK_READY); __setscheduler_prio(p, p->prio); check_class_changing(task_rq(p), p, old_class); -- cgit v1.2.3 From 4269c603cc26df154e0db303a9347e6ec3cc805e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 27 Sep 2024 10:02:40 -1000 Subject: sched_ext: Enable scx_ops_init_task() separately scx_ops_init_task() and the follow-up scx_ops_enable_task() in the fork path were gated by scx_enabled() test and thus __scx_ops_enabled had to be turned on before the first scx_ops_init_task() loop in scx_ops_enable(). However, if an external entity causes sched_class switch before the loop is complete, tasks which are not initialized could be switched to SCX. The following can be reproduced by running a program which keeps toggling a process between SCHED_OTHER and SCHED_EXT using sched_setscheduler(2). sched_ext: Invalid task state transition 0 -> 3 for fish[1623] WARNING: CPU: 1 PID: 1650 at kernel/sched/ext.c:3392 scx_ops_enable_task+0x1a1/0x200 ... Sched_ext: simple (enabling) RIP: 0010:scx_ops_enable_task+0x1a1/0x200 ... switching_to_scx+0x13/0xa0 __sched_setscheduler+0x850/0xa50 do_sched_setscheduler+0x104/0x1c0 __x64_sys_sched_setscheduler+0x18/0x30 do_syscall_64+0x7b/0x140 entry_SYSCALL_64_after_hwframe+0x76/0x7e Fix it by gating scx_ops_init_task() separately using scx_ops_init_task_enabled. __scx_ops_enabled is now set after all tasks are finished with scx_ops_init_task().
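For context, the reproducer described above need not be elaborate. A sketch along the following lines is enough to race sched_setscheduler(2) against the enable path (this is an illustration, not the exact program that was used, and it assumes SCHED_EXT is policy 7 as defined in the sched_ext UAPI headers):

    #include <sched.h>
    #include <unistd.h>

    #ifndef SCHED_EXT
    #define SCHED_EXT 7     /* assumed value from the sched_ext UAPI */
    #endif

    int main(void)
    {
            struct sched_param param = { .sched_priority = 0 };
            pid_t pid = getpid();

            /* flip between SCHED_EXT and SCHED_OTHER as fast as possible */
            for (;;) {
                    sched_setscheduler(pid, SCHED_EXT, &param);
                    sched_setscheduler(pid, SCHED_OTHER, &param);
            }
            return 0;
    }

Running this while a BPF scheduler is being loaded exercises the window that the patch closes.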
Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel/sched/ext.c') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index e83af19de59d..7729594882d9 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -853,6 +853,7 @@ DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0); +static bool scx_ops_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); @@ -3565,7 +3566,7 @@ int scx_fork(struct task_struct *p) { percpu_rwsem_assert_held(&scx_fork_rwsem); - if (scx_enabled()) + if (scx_ops_init_task_enabled) return scx_ops_init_task(p, task_group(p), true); else return 0; @@ -3573,7 +3574,7 @@ int scx_fork(struct task_struct *p) void scx_post_fork(struct task_struct *p) { - if (scx_enabled()) { + if (scx_ops_init_task_enabled) { scx_set_task_state(p, SCX_TASK_READY); /* @@ -4453,6 +4454,8 @@ static void scx_ops_disable_workfn(struct kthread_work *work) cpus_read_lock(); scx_cgroup_lock(); + scx_ops_init_task_enabled = false; + spin_lock_irq(&scx_tasks_lock); scx_task_iter_init(&sti); /* @@ -5132,7 +5135,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (ret) goto err_disable_unlock_all; - static_branch_enable_cpuslocked(&__scx_ops_enabled); + WARN_ON_ONCE(scx_ops_init_task_enabled); + scx_ops_init_task_enabled = true; /* * Enable ops for every task. Fork is excluded by scx_fork_rwsem @@ -5175,9 +5179,11 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) spin_unlock_irq(&scx_tasks_lock); /* - * All tasks are prepped but the tasks are not enabled. Switch everyone. + * All tasks are READY. It's safe to turn on scx_enabled() and switch + * all eligible tasks. */ WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); + static_branch_enable_cpuslocked(&__scx_ops_enabled); /* * We're fully committed and can't fail. The task READY -> ENABLED * transitions here are synchronized against sched_ext_free() through * scx_tasks_lock. */ -- cgit v1.2.3 From 568894edbe48f0878f787ed533dc9dbfd09c0fbe Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 27 Sep 2024 10:02:40 -1000 Subject: sched_ext: Add scx_cgroup_enabled to gate cgroup operations and fix scx_tg_online() If the BPF scheduler does not implement ops.cgroup_init(), scx_tg_online() didn't set SCX_TG_INITED which meant that ops.cgroup_exit(), even if implemented, wouldn't be called from scx_tg_offline(). This is because SCX_HAS_OP(cgroup_init) is used to test both whether SCX cgroup operations are enabled and ops.cgroup_init() exists. Fix it by introducing a separate bool scx_cgroup_enabled to gate cgroup operations and use SCX_HAS_OP(cgroup_init) only to test whether ops.cgroup_init() exists. Make all cgroup operations consistently use scx_cgroup_enabled to test whether cgroup operations are enabled. scx_cgroup_enabled is added instead of using scx_enabled() to ease planned locking updates.
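For reference, the cgroup callbacks gated by scx_cgroup_enabled have the following shape on the BPF side. This is a minimal sketch, not code from this series; the names are illustrative and it assumes the helpers from tools/sched_ext's common.bpf.h:

    s32 BPF_STRUCT_OPS_SLEEPABLE(sketch_cgroup_init, struct cgroup *cgrp,
                                 struct scx_cgroup_init_args *args)
    {
            /* per-cgroup setup, e.g. creating a per-cgroup DSQ; args->weight
             * carries the cgroup's CPU weight at init time */
            return 0;
    }

    void BPF_STRUCT_OPS(sketch_cgroup_exit, struct cgroup *cgrp)
    {
            /* paired per-cgroup teardown, invoked from scx_tg_offline() */
    }

With the fix, a scheduler that implements only cgroup_exit() still receives the teardown callback, since SCX_TG_INITED is now set whenever cgroup operations are enabled rather than only when cgroup_init() is implemented.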
Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) (limited to 'kernel/sched/ext.c') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7729594882d9..f9675ced75e8 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3706,6 +3706,7 @@ bool scx_can_stop_tick(struct rq *rq) #ifdef CONFIG_EXT_GROUP_SCHED DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); +static bool scx_cgroup_enabled; static bool cgroup_warned_missing_weight; static bool cgroup_warned_missing_idle; @@ -3725,8 +3726,7 @@ static void scx_cgroup_warn_missing_weight(struct task_group *tg) static void scx_cgroup_warn_missing_idle(struct task_group *tg) { - if (scx_ops_enable_state() == SCX_OPS_DISABLED || - cgroup_warned_missing_idle) + if (!scx_cgroup_enabled || cgroup_warned_missing_idle) return; if (!tg->idle) @@ -3747,15 +3747,18 @@ int scx_tg_online(struct task_group *tg) scx_cgroup_warn_missing_weight(tg); - if (SCX_HAS_OP(cgroup_init)) { - struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; + if (scx_cgroup_enabled) { + if (SCX_HAS_OP(cgroup_init)) { + struct scx_cgroup_init_args args = + { .weight = tg->scx_weight }; - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, - tg->css.cgroup, &args); - if (!ret) + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, + tg->css.cgroup, &args); + if (ret) + ret = ops_sanitize_err("cgroup_init", ret); + } + if (ret == 0) tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; - else - ret = ops_sanitize_err("cgroup_init", ret); } else { tg->scx_flags |= SCX_TG_ONLINE; } @@ -3786,7 +3789,7 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset) /* released in scx_finish/cancel_attach() */ percpu_down_read(&scx_cgroup_rwsem); - if (!scx_enabled()) + if (!scx_cgroup_enabled) return 0; cgroup_taskset_for_each(p, css, tset) { @@ -3829,7 +3832,7 @@ err: void scx_move_task(struct task_struct *p) { - if (!scx_enabled()) + if (!scx_cgroup_enabled) return; /* @@ -3865,7 +3868,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) struct cgroup_subsys_state *css; struct task_struct *p; - if (!scx_enabled()) + if (!scx_cgroup_enabled) goto out_unlock; cgroup_taskset_for_each(p, css, tset) { @@ -3882,7 +3885,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) { percpu_down_read(&scx_cgroup_rwsem); - if (tg->scx_weight != weight) { + if (scx_cgroup_enabled && tg->scx_weight != weight) { if (SCX_HAS_OP(cgroup_set_weight)) SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, tg_cgrp(tg), weight); @@ -4054,6 +4057,9 @@ static void scx_cgroup_exit(void) percpu_rwsem_assert_held(&scx_cgroup_rwsem); + WARN_ON_ONCE(!scx_cgroup_enabled); + scx_cgroup_enabled = false; + /* * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk * cgroups and exit all the inited ones, all online cgroups are exited. @@ -4129,6 +4135,9 @@ static int scx_cgroup_init(void) } rcu_read_unlock(); + WARN_ON_ONCE(scx_cgroup_enabled); + scx_cgroup_enabled = true; + return 0; } -- cgit v1.2.3 From 160216568cddc9d6e7f36133ba41d25459d90de4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 27 Sep 2024 10:02:40 -1000 Subject: sched_ext: Decouple locks in scx_ops_disable_workfn() The disable path uses three big locks - scx_fork_rwsem, scx_cgroup_rwsem and cpus_read_lock. Currently, the locks are grabbed together which is prone to locking order problems. 
With the preceding scx_cgroup_enabled change, we can decouple them: - As cgroup disabling no longer requires modifying a static_key which requires cpus_read_lock(), no need to grab cpus_read_lock() before grabbing scx_cgroup_rwsem. - cgroup can now be independently disabled before tasks are moved back to the fair class. Relocate scx_cgroup_exit() invocation before scx_fork_rwsem is grabbed, drop now unnecessary cpus_read_lock() and move static_key operations out of scx_fork_rwsem. This decouples all three locks in the disable path. Signed-off-by: Tejun Heo Reported-and-tested-by: Aboorva Devarajan Link: http://lkml.kernel.org/r/8cd0ec0c4c7c1bc0119e61fbef0bee9d5e24022d.camel@linux.ibm.com --- kernel/sched/ext.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) (limited to 'kernel/sched/ext.c') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index f9675ced75e8..7d0ce7e6ea1f 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4456,21 +4456,23 @@ static void scx_ops_disable_workfn(struct kthread_work *work) WRITE_ONCE(scx_switching_all, false); /* - * Avoid racing against fork and cgroup changes. See scx_ops_enable() - * for explanation on the locking order. + * Shut down cgroup support before tasks so that the cgroup attach path + * doesn't race against scx_ops_exit_task(). */ - percpu_down_write(&scx_fork_rwsem); - cpus_read_lock(); scx_cgroup_lock(); + scx_cgroup_exit(); + scx_cgroup_unlock(); - scx_ops_init_task_enabled = false; - - spin_lock_irq(&scx_tasks_lock); - scx_task_iter_init(&sti); /* * The BPF scheduler is going away. All tasks including %TASK_DEAD ones * must be switched out and exited synchronously. */ + percpu_down_write(&scx_fork_rwsem); + + scx_ops_init_task_enabled = false; + + spin_lock_irq(&scx_tasks_lock); + scx_task_iter_init(&sti); while ((p = scx_task_iter_next_locked(&sti))) { const struct sched_class *old_class = p->sched_class; struct sched_enq_and_set_ctx ctx; @@ -4488,23 +4490,18 @@ static void scx_ops_disable_workfn(struct kthread_work *work) } scx_task_iter_exit(&sti); spin_unlock_irq(&scx_tasks_lock); + percpu_up_write(&scx_fork_rwsem); /* no task is on scx, turn off all the switches and flush in-progress calls */ - static_branch_disable_cpuslocked(&__scx_ops_enabled); + static_branch_disable(&__scx_ops_enabled); for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) - static_branch_disable_cpuslocked(&scx_has_op[i]); - static_branch_disable_cpuslocked(&scx_ops_enq_last); - static_branch_disable_cpuslocked(&scx_ops_enq_exiting); - static_branch_disable_cpuslocked(&scx_ops_cpu_preempt); - static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); + static_branch_disable(&scx_has_op[i]); + static_branch_disable(&scx_ops_enq_last); + static_branch_disable(&scx_ops_enq_exiting); + static_branch_disable(&scx_ops_cpu_preempt); + static_branch_disable(&scx_builtin_idle_enabled); synchronize_rcu(); - scx_cgroup_exit(); - - scx_cgroup_unlock(); - cpus_read_unlock(); - percpu_up_write(&scx_fork_rwsem); - if (ei->kind >= SCX_EXIT_ERROR) { pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", scx_ops.name, ei->reason); -- cgit v1.2.3 From efe231d9debf6db812bebb262407c95b21cdb8a2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 27 Sep 2024 10:02:40 -1000 Subject: sched_ext: Decouple locks in scx_ops_enable() The enable path uses three big locks - scx_fork_rwsem, scx_cgroup_rwsem and cpus_read_lock. Currently, the locks are grabbed together which is prone to locking order problems. 
For example, currently, there is a possible deadlock involving scx_fork_rwsem and cpus_read_lock. cpus_read_lock has to nest inside scx_fork_rwsem due to locking order existing in other subsystems. However, there exists a dependency in the other direction during hotplug if hotplug needs to fork a new task, which happens in some cases. This leads to the following deadlock: scx_ops_enable() hotplug percpu_down_write(&cpu_hotplug_lock) percpu_down_write(&scx_fork_rwsem) block on cpu_hotplug_lock kthread_create() waits for kthreadd kthreadd blocks on scx_fork_rwsem Note that this doesn't trigger lockdep because the hotplug side dependency bounces through kthreadd. With the preceding scx_cgroup_enabled change, this can be solved by decoupling cpus_read_lock, which is needed for static_key manipulations, from the other two locks. - Move the first block of static_key manipulations outside of scx_fork_rwsem and scx_cgroup_rwsem. This is now safe with the preceding scx_cgroup_enabled change. - Drop scx_cgroup_rwsem and scx_fork_rwsem between the two task iteration blocks so that __scx_ops_enabled static_key enabling is outside the two rwsems. Signed-off-by: Tejun Heo Reported-and-tested-by: Aboorva Devarajan Link: http://lkml.kernel.org/r/8cd0ec0c4c7c1bc0119e61fbef0bee9d5e24022d.camel@linux.ibm.com --- kernel/sched/ext.c | 67 ++++++++++++++++++++++-------------------------------- 1 file changed, 27 insertions(+), 40 deletions(-) (limited to 'kernel/sched/ext.c') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7d0ce7e6ea1f..0c398ecdb938 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5049,7 +5049,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init); if (ret) { ret = ops_sanitize_err("init", ret); - goto err_disable_unlock_cpus; + cpus_read_unlock(); + goto err_disable; } } @@ -5092,54 +5093,30 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) */ scx_ops_bypass(true); - /* - * Lock out forks, cgroup on/offlining and moves before opening the - * floodgate so that they don't wander into the operations prematurely. - * - * We don't need to keep the CPUs stable but static_branch_*() requires - * cpus_read_lock() and scx_cgroup_rwsem must nest inside - * cpu_hotplug_lock because of the following dependency chain: - * - * cpu_hotplug_lock --> cgroup_threadgroup_rwsem --> scx_cgroup_rwsem - * - * So, we need to do cpus_read_lock() before scx_cgroup_lock() and use - * static_branch_*_cpuslocked(). 
- * - * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the - * following dependency chain: - * - * scx_fork_rwsem --> pernet_ops_rwsem --> cpu_hotplug_lock - */ - percpu_down_write(&scx_fork_rwsem); - cpus_read_lock(); - scx_cgroup_lock(); - for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) if (((void (**)(void))ops)[i]) - static_branch_enable_cpuslocked(&scx_has_op[i]); + static_branch_enable(&scx_has_op[i]); if (ops->flags & SCX_OPS_ENQ_LAST) - static_branch_enable_cpuslocked(&scx_ops_enq_last); + static_branch_enable(&scx_ops_enq_last); if (ops->flags & SCX_OPS_ENQ_EXITING) - static_branch_enable_cpuslocked(&scx_ops_enq_exiting); + static_branch_enable(&scx_ops_enq_exiting); if (scx_ops.cpu_acquire || scx_ops.cpu_release) - static_branch_enable_cpuslocked(&scx_ops_cpu_preempt); + static_branch_enable(&scx_ops_cpu_preempt); if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { reset_idle_masks(); - static_branch_enable_cpuslocked(&scx_builtin_idle_enabled); + static_branch_enable(&scx_builtin_idle_enabled); } else { - static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); + static_branch_disable(&scx_builtin_idle_enabled); } /* - * All cgroups should be initialized before letting in tasks. cgroup - * on/offlining and task migrations are already locked out. + * Lock out forks, cgroup on/offlining and moves before opening the + * floodgate so that they don't wander into the operations prematurely. */ - ret = scx_cgroup_init(); - if (ret) - goto err_disable_unlock_all; + percpu_down_write(&scx_fork_rwsem); WARN_ON_ONCE(scx_ops_init_task_enabled); scx_ops_init_task_enabled = true; @@ -5150,7 +5127,18 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) * leaving as sched_ext_free() can handle both prepped and enabled * tasks. Prep all tasks first and then enable them with preemption * disabled. + * + * All cgroups should be initialized before scx_ops_init_task() so that + * the BPF scheduler can reliably track each task's cgroup membership + * from scx_ops_init_task(). Lock out cgroup on/offlining and task + * migrations while tasks are being initialized so that + * scx_cgroup_can_attach() never sees uninitialized tasks. */ + scx_cgroup_lock(); + ret = scx_cgroup_init(); + if (ret) + goto err_disable_unlock_all; + spin_lock_irq(&scx_tasks_lock); scx_task_iter_init(&sti); while ((p = scx_task_iter_next_locked(&sti))) { @@ -5183,19 +5171,22 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) } scx_task_iter_exit(&sti); spin_unlock_irq(&scx_tasks_lock); + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); /* * All tasks are READY. It's safe to turn on scx_enabled() and switch * all eligible tasks. */ WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); - static_branch_enable_cpuslocked(&__scx_ops_enabled); + static_branch_enable(&__scx_ops_enabled); /* * We're fully committed and can't fail. The task READY -> ENABLED * transitions here are synchronized against sched_ext_free() through * scx_tasks_lock. 
*/ + percpu_down_write(&scx_fork_rwsem); spin_lock_irq(&scx_tasks_lock); scx_task_iter_init(&sti); while ((p = scx_task_iter_next_locked(&sti))) { @@ -5213,10 +5204,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) } scx_task_iter_exit(&sti); spin_unlock_irq(&scx_tasks_lock); - - scx_cgroup_unlock(); - cpus_read_unlock(); percpu_up_write(&scx_fork_rwsem); + scx_ops_bypass(false); /* @@ -5259,8 +5248,6 @@ err_disable_unlock_all: scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); scx_ops_bypass(false); -err_disable_unlock_cpus: - cpus_read_unlock(); err_disable: mutex_unlock(&scx_ops_enable_mutex); /* must be fully disabled before returning */ -- cgit v1.2.3 From 95b873693a0841e02b812e693296a884362fdd51 Mon Sep 17 00:00:00 2001 From: Zhang Qiao Date: Thu, 26 Sep 2024 18:39:49 +0800 Subject: sched_ext: Remove redundant p->nr_cpus_allowed checker select_task_rq() has already checked that 'p->nr_cpus_allowed > 1', so the 'p->nr_cpus_allowed == 1' check in scx_select_cpu_dfl() is redundant. Signed-off-by: Zhang Qiao Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'kernel/sched/ext.c') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 0c398ecdb938..3cd7c50a51c5 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3074,22 +3074,13 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, * there is an idle core elsewhere on the system. */ cpu = smp_processor_id(); - if ((wake_flags & SCX_WAKE_SYNC) && p->nr_cpus_allowed > 1 && + if ((wake_flags & SCX_WAKE_SYNC) && !cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING) && cpu_rq(cpu)->scx.local_dsq.nr == 0) { if (cpumask_test_cpu(cpu, p->cpus_ptr)) goto cpu_found; } - if (p->nr_cpus_allowed == 1) { - if (test_and_clear_cpu_idle(prev_cpu)) { - cpu = prev_cpu; - goto cpu_found; - } else { - return prev_cpu; - } - } - /* * If CPU has SMT, any wholly idle CPU is likely a better pick than * partially idle @prev_cpu. -- cgit v1.2.3 From cc9877fb76771b7cbce6c9ec239f13a1d7759876 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 2 Oct 2024 10:33:37 -1000 Subject: sched_ext: Improve error reporting during loading When the BPF scheduler fails, ops.exit() allows rich error reporting through scx_exit_info. Use the ops.exit() path consistently for all failures which can be caused by the BPF scheduler: - scx_ops_error() is called after ops.init() and ops.cgroup_init() failure to record error information. - ops.init_task() failure now uses scx_ops_error() instead of pr_err(). - The err_disable path is updated to automatically trigger scx_ops_error() to cover cases where the error message hasn't already been generated, and to always return 0 indicating init success so that the error is reported through ops.exit(). Signed-off-by: Tejun Heo Cc: David Vernet Cc: Daniel Hodges Cc: Changwoo Min Cc: Andrea Righi Cc: Dan Schatzberg --- kernel/sched/ext.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'kernel/sched/ext.c') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 3cd7c50a51c5..76f04f3ace61 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -625,6 +625,10 @@ struct sched_ext_ops { /** * exit - Clean up after the BPF scheduler * @info: Exit info + * + * ops.exit() is also called on ops.init() failure, which is a bit + * unusual. This is to allow rich reporting through @info on how + * ops.init() failed.
*/ void (*exit)(struct scx_exit_info *info); @@ -4117,6 +4121,7 @@ static int scx_cgroup_init(void) css->cgroup, &args); if (ret) { css_put(css); + scx_ops_error("ops.cgroup_init() failed (%d)", ret); return ret; } tg->scx_flags |= SCX_TG_INITED; @@ -5041,6 +5046,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (ret) { ret = ops_sanitize_err("init", ret); cpus_read_unlock(); + scx_ops_error("ops.init() failed (%d)", ret); goto err_disable; } } @@ -5150,8 +5156,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) spin_lock_irq(&scx_tasks_lock); scx_task_iter_exit(&sti); spin_unlock_irq(&scx_tasks_lock); - pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n", - ret, p->comm, p->pid); + scx_ops_error("ops.init_task() failed (%d) for %s[%d]", + ret, p->comm, p->pid); goto err_disable_unlock_all; } @@ -5199,14 +5205,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_ops_bypass(false); - /* - * Returning an error code here would lose the recorded error - * information. Exit indicating success so that the error is notified - * through ops.exit() with all the details. - */ if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); - ret = 0; goto err_disable; } @@ -5241,10 +5241,18 @@ err_disable_unlock_all: scx_ops_bypass(false); err_disable: mutex_unlock(&scx_ops_enable_mutex); - /* must be fully disabled before returning */ - scx_ops_disable(SCX_EXIT_ERROR); + /* + * Returning an error code here would not pass all the error information + * to userspace. Record errno using scx_ops_error() for cases + * scx_ops_error() wasn't already invoked and exit indicating success so + * that the error is notified through ops.exit() with all the details. + * + * Flush scx_ops_disable_work to ensure that error is reported before + * init completion. + */ + scx_ops_error("scx_ops_enable() failed (%d)", ret); kthread_flush_work(&scx_ops_disable_work); - return ret; + return 0; } -- cgit v1.2.3 From ec010333ce7cf3270ae7193a6724794d5a179625 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 2 Oct 2024 10:34:38 -1000 Subject: sched_ext: scx_cgroup_exit() may be called without successful scx_cgroup_init() 568894edbe48 ("sched_ext: Add scx_cgroup_enabled to gate cgroup operations and fix scx_tg_online()") assumed that scx_cgroup_exit() is only called after scx_cgroup_init() finished successfully. This isn't true. scx_cgroup_exit() can be called without scx_cgroup_init() being called at all or after scx_cgroup_init() failed in the middle. As init state is tracked per cgroup, scx_cgroup_exit() can be used safely to clean up in all cases. Remove the incorrect WARN_ON_ONCE(). 
Signed-off-by: Tejun Heo Fixes: 568894edbe48 ("sched_ext: Add scx_cgroup_enabled to gate cgroup operations and fix scx_tg_online()") --- kernel/sched/ext.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/sched/ext.c') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 76f04f3ace61..1df2082d1352 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4052,7 +4052,6 @@ static void scx_cgroup_exit(void) percpu_rwsem_assert_held(&scx_cgroup_rwsem); - WARN_ON_ONCE(!scx_cgroup_enabled); scx_cgroup_enabled = false; /* -- cgit v1.2.3 From 9b671793c7d95f020791415cbbcc82b9c007d19c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 27 Sep 2024 13:46:13 -1000 Subject: sched_ext, scx_qmap: Add and use SCX_ENQ_CPU_SELECTED scx_qmap and other schedulers in the SCX repo are using SCX_ENQ_WAKEUP to tell whether ops.select_cpu() was called. This is incorrect as ops.select_cpu() can be skipped in the wakeup path and leads to e.g. incorrectly skipping direct dispatch for tasks that are bound to a single CPU. sched core has been updated to specify ENQUEUE_RQ_SELECTED if ->select_task_rq() was called. Map it to SCX_ENQ_CPU_SELECTED and update scx_qmap to test it instead of SCX_ENQ_WAKEUP. Signed-off-by: Tejun Heo Acked-by: David Vernet Cc: Daniel Hodges Cc: Changwoo Min Cc: Andrea Righi Cc: Dan Schatzberg --- kernel/sched/ext.c | 1 + tools/sched_ext/scx_qmap.bpf.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/sched/ext.c') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 1df2082d1352..410a4df8a121 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -696,6 +696,7 @@ enum scx_enq_flags { /* expose select ENQUEUE_* flags as enums */ SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, SCX_ENQ_HEAD = ENQUEUE_HEAD, + SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED, /* high 32bits are SCX specific */ diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 67e2a7968cc9..5d1f880d1149 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -230,8 +230,8 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) return; } - /* if !WAKEUP, select_cpu() wasn't called, try direct dispatch */ - if (!(enq_flags & SCX_ENQ_WAKEUP) && + /* if select_cpu() wasn't called, try direct dispatch */ + if (!(enq_flags & SCX_ENQ_CPU_SELECTED) && (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) { __sync_fetch_and_add(&nr_ddsp_from_enq, 1); scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags); -- cgit v1.2.3