From 7ccc215187a78f2e5260b9ca82505219bcbec899 Mon Sep 17 00:00:00 2001 From: Wenchao Hao Date: Thu, 6 Jun 2024 16:52:16 +0800 Subject: workqueue: Clean code in alloc_and_link_pwqs() wq->flags does not change, so it's not necessary to check whether WQ_BH is set inside the for_each_possible_cpu() loop; move the definition and assignment of pools out of the loop to simplify the code. Signed-off-by: Wenchao Hao Signed-off-by: Tejun Heo --- kernel/workqueue.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 003474c9a77d..f70ac724ca7b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5461,16 +5461,17 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) goto enomem; if (!(wq->flags & WQ_UNBOUND)) { + struct worker_pool __percpu *pools; + + if (wq->flags & WQ_BH) + pools = bh_worker_pools; + else + pools = cpu_worker_pools; + for_each_possible_cpu(cpu) { struct pool_workqueue **pwq_p; - struct worker_pool __percpu *pools; struct worker_pool *pool; - if (wq->flags & WQ_BH) - pools = bh_worker_pools; - else - pools = cpu_worker_pools; - pool = &(per_cpu_ptr(pools, cpu)[highpri]); pwq_p = per_cpu_ptr(wq->cpu_pwq, cpu); -- cgit v1.2.3 From 37c2277fad7e90e2ca1202c9cdd13329ac91ecff Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 9 Jun 2024 10:27:24 +0200 Subject: workqueue: replace call_rcu by kfree_rcu for simple kmem_cache_free callback Since SLOB was removed, it is not necessary to use call_rcu when the callback only performs kmem_cache_free. Use kfree_rcu() directly. The changes were done using the following Coccinelle semantic patch. This semantic patch is designed to ignore cases where the callback function is used in another way. // @r@ expression e; local idexpression e2; identifier cb,f; position p; @@ ( call_rcu(...,e2) | call_rcu(&e->f,cb@p) ) @r1@ type T; identifier x,r.cb; @@ cb(...) { ( kmem_cache_free(...); | T x = ...; kmem_cache_free(...,x); | T x; x = ...; kmem_cache_free(...,x); ) } @s depends on r1@ position p != r.p; identifier r.cb; @@ cb@p @script:ocaml@ cb << r.cb; p << s.p; @@ Printf.eprintf "Other use of %s at %s:%d\n" cb (List.hd p).file (List.hd p).line @depends on r1 && !s@ expression e; identifier r.cb,f; position r.p; @@ - call_rcu(&e->f,cb@p) + kfree_rcu(e,f) @r1a depends on !s@ type T; identifier x,r.cb; @@ - cb(...) { ( - kmem_cache_free(...); | - T x = ...; - kmem_cache_free(...,x); | - T x; - x = ...; - kmem_cache_free(...,x); ) - } // Signed-off-by: Julia Lawall Reviewed-by: Paul E. McKenney Reviewed-by: Vlastimil Babka Signed-off-by: Tejun Heo --- kernel/workqueue.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f70ac724ca7b..b06c88814dde 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5022,12 +5022,6 @@ fail: return NULL; } -static void rcu_free_pwq(struct rcu_head *rcu) -{ - kmem_cache_free(pwq_cache, - container_of(rcu, struct pool_workqueue, rcu)); -} - /* * Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero * refcnt and needs to be destroyed.
@@ -5073,7 +5067,7 @@ static void pwq_release_workfn(struct kthread_work *work) raw_spin_unlock_irq(&nna->lock); } - call_rcu(&pwq->rcu, rcu_free_pwq); + kfree_rcu(pwq, rcu); /* * If we're the last pwq going away, @wq is already dead and no one -- cgit v1.2.3 From b56c720718e97d2a1a0523e38fdc836adc188a2e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 19 Jun 2024 18:39:34 +0800 Subject: workqueue: Avoid nr_active manipulation in grabbing inactive items Current try_to_grab_pending() activates the inactive item and subsequently treats it as though it were a standard activated item. This approach prevents duplicating handling logic for both active and inactive items, yet the premature activation of an inactive item triggers trace_workqueue_activate_work(), yielding an unintended user space visible side effect. And the unnecessary increment of the nr_active, which is not a simple counter now, followed by a counteracted decrement, is inefficient and complicates the code. Just remove the nr_active manipulation code in grabbing inactive items. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 42 +++++++++--------------------------------- 1 file changed, 9 insertions(+), 33 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b06c88814dde..23e424bda7c2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1683,33 +1683,6 @@ static void __pwq_activate_work(struct pool_workqueue *pwq, __clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb); } -/** - * pwq_activate_work - Activate a work item if inactive - * @pwq: pool_workqueue @work belongs to - * @work: work item to activate - * - * Returns %true if activated. %false if already active. - */ -static bool pwq_activate_work(struct pool_workqueue *pwq, - struct work_struct *work) -{ - struct worker_pool *pool = pwq->pool; - struct wq_node_nr_active *nna; - - lockdep_assert_held(&pool->lock); - - if (!(*work_data_bits(work) & WORK_STRUCT_INACTIVE)) - return false; - - nna = wq_node_nr_active(pwq->wq, pool->node); - if (nna) - atomic_inc(&nna->nr); - - pwq->nr_active++; - __pwq_activate_work(pwq, work); - return true; -} - static bool tryinc_node_nr_active(struct wq_node_nr_active *nna) { int max = READ_ONCE(nna->max); @@ -2116,7 +2089,7 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags, */ pwq = get_work_pwq(work); if (pwq && pwq->pool == pool) { - unsigned long work_data; + unsigned long work_data = *work_data_bits(work); debug_work_deactivate(work); @@ -2125,13 +2098,17 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags, * pwq->inactive_works since a queued barrier can't be * canceled (see the comments in insert_wq_barrier()). * - * An inactive work item cannot be grabbed directly because + * An inactive work item cannot be deleted directly because * it might have linked barrier work items which, if left * on the inactive_works list, will confuse pwq->nr_active - * management later on and cause stall. Make sure the work - * item is activated before grabbing. + * management later on and cause stall. Move the linked + * barrier work items to the worklist when deleting the grabbed + * item. Also keep WORK_STRUCT_INACTIVE in work_data, so that + * it doesn't participate in nr_active management in later + * pwq_dec_nr_in_flight(). 
*/ - pwq_activate_work(pwq, work); + if (work_data & WORK_STRUCT_INACTIVE) + move_linked_works(work, &pwq->pool->worklist, NULL); list_del_init(&work->entry); @@ -2139,7 +2116,6 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags, * work->data points to pwq iff queued. Let's point to pool. As * this destroys work->data needed by the next step, stash it. */ - work_data = *work_data_bits(work); set_work_pool_and_keep_pending(work, pool->id, pool_offq_flags(pool)); -- cgit v1.2.3 From 68f83057b913467a999e1bf9e0da6a119668f769 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 21 Jun 2024 15:32:22 +0800 Subject: workqueue: Reap workers via kthread_stop() and remove detach_completion The code to kick off the destruction of workers is now in a process context (idle_cull_fn()), so kthread_stop() can be used in the process context to replace the work of pool->detach_completion. The wakeup in wake_dying_workers() is unneeded after this change, but it is harmless; just keep it here until the next patch renames wake_dying_workers() rather than renaming it again and again. Cc: Valentin Schneider Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 23e424bda7c2..9c5df3985435 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -216,7 +216,6 @@ struct worker_pool { struct worker *manager; /* L: purely informational */ struct list_head workers; /* A: attached workers */ struct list_head dying_workers; /* A: workers about to die */ - struct completion *detach_completion; /* all workers detached */ struct ida worker_ida; /* worker IDs for task name */ @@ -2696,7 +2695,6 @@ static void worker_attach_to_pool(struct worker *worker, static void worker_detach_from_pool(struct worker *worker) { struct worker_pool *pool = worker->pool; - struct completion *detach_completion = NULL; /* there is one permanent BH worker per CPU which should never detach */ WARN_ON_ONCE(pool->flags & POOL_BH); @@ -2707,15 +2705,10 @@ static void worker_detach_from_pool(struct worker *worker) list_del(&worker->node); worker->pool = NULL; - if (list_empty(&pool->workers) && list_empty(&pool->dying_workers)) - detach_completion = pool->detach_completion; mutex_unlock(&wq_pool_attach_mutex); /* clear leftover flags without pool->lock after it is detached */ worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND); - - if (detach_completion) - complete(detach_completion); } /** @@ -2816,10 +2809,9 @@ static void unbind_worker(struct worker *worker) static void wake_dying_workers(struct list_head *cull_list) { - struct worker *worker, *tmp; + struct worker *worker; - list_for_each_entry_safe(worker, tmp, cull_list, entry) { - list_del_init(&worker->entry); + list_for_each_entry(worker, cull_list, entry) { unbind_worker(worker); /* * If the worker was somehow already running, then it had to be @@ -2835,6 +2827,17 @@ static void wake_dying_workers(struct list_head *cull_list) } } +static void reap_dying_workers(struct list_head *cull_list) +{ + struct worker *worker, *tmp; + + list_for_each_entry_safe(worker, tmp, cull_list, entry) { + list_del_init(&worker->entry); + kthread_stop_put(worker->task); + kfree(worker); + } +} + /** * set_worker_dying - Tag a worker for destruction * @worker: worker to be destroyed @@ -2866,6 +2869,9 @@ static void set_worker_dying(struct worker *worker, struct list_head *list)
list_move(&worker->entry, list); list_move(&worker->node, &pool->dying_workers); + + /* get an extra task struct reference for later kthread_stop_put() */ + get_task_struct(worker->task); } /** @@ -2949,6 +2955,8 @@ static void idle_cull_fn(struct work_struct *work) raw_spin_unlock_irq(&pool->lock); wake_dying_workers(&cull_list); mutex_unlock(&wq_pool_attach_mutex); + + reap_dying_workers(&cull_list); } static void send_mayday(struct work_struct *work) @@ -3330,7 +3338,6 @@ woke_up: ida_free(&pool->worker_ida, worker->id); worker_detach_from_pool(worker); WARN_ON_ONCE(!list_empty(&worker->entry)); - kfree(worker); return 0; } @@ -4863,7 +4870,6 @@ static void rcu_free_pool(struct rcu_head *rcu) */ static void put_unbound_pool(struct worker_pool *pool) { - DECLARE_COMPLETION_ONSTACK(detach_completion); struct worker *worker; LIST_HEAD(cull_list); @@ -4917,12 +4923,9 @@ static void put_unbound_pool(struct worker_pool *pool) wake_dying_workers(&cull_list); - if (!list_empty(&pool->workers) || !list_empty(&pool->dying_workers)) - pool->detach_completion = &detach_completion; mutex_unlock(&wq_pool_attach_mutex); - if (pool->detach_completion) - wait_for_completion(pool->detach_completion); + reap_dying_workers(&cull_list); /* shut down the timers */ del_timer_sync(&pool->idle_timer); -- cgit v1.2.3 From f45b1c3c33373c8c29a95a5188165d6eb634823a Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 21 Jun 2024 15:32:23 +0800 Subject: workqueue: Don't bind the rescuer in the last working cpu So that when the rescuer is woken up next time, it will not interrupt the last working cpu which might be busy on other crucial works but have nothing to do with the rescuer's incoming works. Cc: Valentin Schneider Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9c5df3985435..cd1895630145 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2684,6 +2684,17 @@ static void worker_attach_to_pool(struct worker *worker, mutex_unlock(&wq_pool_attach_mutex); } +static void unbind_worker(struct worker *worker) +{ + lockdep_assert_held(&wq_pool_attach_mutex); + + kthread_set_per_cpu(worker->task, -1); + if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask)) + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0); + else + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); +} + /** * worker_detach_from_pool() - detach a worker from its pool * @worker: worker which is attached to its pool @@ -2701,7 +2712,7 @@ static void worker_detach_from_pool(struct worker *worker) mutex_lock(&wq_pool_attach_mutex); - kthread_set_per_cpu(worker->task, -1); + unbind_worker(worker); list_del(&worker->node); worker->pool = NULL; @@ -2796,17 +2807,6 @@ fail: return NULL; } -static void unbind_worker(struct worker *worker) -{ - lockdep_assert_held(&wq_pool_attach_mutex); - - kthread_set_per_cpu(worker->task, -1); - if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask)) - WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0); - else - WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); -} - static void wake_dying_workers(struct list_head *cull_list) { struct worker *worker; -- cgit v1.2.3 From f4b7b53c94afdec6dd0f1f834cfcc40595ddc916 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 21 Jun 2024 15:32:24 +0800 Subject: workqueue: 
Detach workers directly in idle_cull_fn() The code to kick off the destruction of workers is now in a process context (idle_cull_fn()), and the detaching of a worker is not required to be inside the worker thread now, so just do the detaching directly in idle_cull_fn(). wake_dying_workers() is renamed to detach_dying_workers() and the unneeded wakeup in wake_dying_workers() is also removed. Cc: Valentin Schneider Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 45 +++++++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 26 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index cd1895630145..04168972814b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2695,6 +2695,16 @@ static void unbind_worker(struct worker *worker) WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); } + +static void detach_worker(struct worker *worker) +{ + lockdep_assert_held(&wq_pool_attach_mutex); + + unbind_worker(worker); + list_del(&worker->node); + worker->pool = NULL; +} + /** * worker_detach_from_pool() - detach a worker from its pool * @worker: worker which is attached to its pool @@ -2711,11 +2721,7 @@ static void worker_detach_from_pool(struct worker *worker) WARN_ON_ONCE(pool->flags & POOL_BH); mutex_lock(&wq_pool_attach_mutex); - - unbind_worker(worker); - list_del(&worker->node); - worker->pool = NULL; - + detach_worker(worker); mutex_unlock(&wq_pool_attach_mutex); /* clear leftover flags without pool->lock after it is detached */ @@ -2807,24 +2813,12 @@ fail: return NULL; } -static void wake_dying_workers(struct list_head *cull_list) +static void detach_dying_workers(struct list_head *cull_list) { struct worker *worker; - list_for_each_entry(worker, cull_list, entry) { - unbind_worker(worker); - /* - * If the worker was somehow already running, then it had to be - * in pool->idle_list when set_worker_dying() happened or we - * wouldn't have gotten here. - * - * Thus, the worker must either have observed the WORKER_DIE - * flag, or have set its state to TASK_IDLE. Either way, the - * below will be observed by the worker and is safe to do - * outside of pool->lock. - */ - wake_up_process(worker->task); - } + list_for_each_entry(worker, cull_list, entry) + detach_worker(worker); } static void reap_dying_workers(struct list_head *cull_list) @@ -2930,9 +2924,9 @@ static void idle_cull_fn(struct work_struct *work) /* * Grabbing wq_pool_attach_mutex here ensures an already-running worker - * cannot proceed beyong worker_detach_from_pool() in its self-destruct - * path. This is required as a previously-preempted worker could run after - * set_worker_dying() has happened but before wake_dying_workers() did. + * cannot proceed beyong set_pf_worker() in its self-destruct path. + * This is required as a previously-preempted worker could run after + * set_worker_dying() has happened but before detach_dying_workers() did. 
*/ mutex_lock(&wq_pool_attach_mutex); raw_spin_lock_irq(&pool->lock); @@ -2953,7 +2947,7 @@ static void idle_cull_fn(struct work_struct *work) } raw_spin_unlock_irq(&pool->lock); - wake_dying_workers(&cull_list); + detach_dying_workers(&cull_list); mutex_unlock(&wq_pool_attach_mutex); reap_dying_workers(&cull_list); @@ -3336,7 +3330,6 @@ woke_up: set_task_comm(worker->task, "kworker/dying"); ida_free(&pool->worker_ida, worker->id); - worker_detach_from_pool(worker); WARN_ON_ONCE(!list_empty(&worker->entry)); return 0; } @@ -4921,7 +4914,7 @@ static void put_unbound_pool(struct worker_pool *pool) WARN_ON(pool->nr_workers || pool->nr_idle); raw_spin_unlock_irq(&pool->lock); - wake_dying_workers(&cull_list); + detach_dying_workers(&cull_list); mutex_unlock(&wq_pool_attach_mutex); -- cgit v1.2.3 From a071b043ab13ae1f5d12ba6f267936feb800dff8 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 21 Jun 2024 15:32:25 +0800 Subject: workqueue: Remove useless pool->dying_workers A dying worker is first moved from pool->workers to pool->dying_workers in set_worker_dying() and removed from pool->dying_workers in detach_dying_workers(). The whole procedure is in the same lock context of wq_pool_attach_mutex. So pool->dying_workers is useless, just remove it and keep the dying worker in pool->workers after set_worker_dying() and remove it in detach_dying_workers() with wq_pool_attach_mutex held. Cc: Valentin Schneider Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 04168972814b..adf1893b161e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -215,7 +215,6 @@ struct worker_pool { struct worker *manager; /* L: purely informational */ struct list_head workers; /* A: attached workers */ - struct list_head dying_workers; /* A: workers about to die */ struct ida worker_ida; /* worker IDs for task name */ @@ -2862,7 +2861,6 @@ static void set_worker_dying(struct worker *worker, struct list_head *list) worker->flags |= WORKER_DIE; list_move(&worker->entry, list); - list_move(&worker->node, &pool->dying_workers); /* get an extra task struct reference for later kthread_stop_put() */ get_task_struct(worker->task); @@ -4721,7 +4719,6 @@ static int init_worker_pool(struct worker_pool *pool) timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0); INIT_LIST_HEAD(&pool->workers); - INIT_LIST_HEAD(&pool->dying_workers); ida_init(&pool->worker_ida); INIT_HLIST_NODE(&pool->hash_node); -- cgit v1.2.3 From 18e24deb1cc92f2068ce7434a94233741fbd7771 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 25 Jun 2024 21:42:44 +1000 Subject: workqueue: wq_watchdog_touch is always called with valid CPU Warn in the case it is called with cpu == -1. This does not appear to happen anywhere. Signed-off-by: Nicholas Piggin Reviewed-by: Paul E.
McKenney Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index adf1893b161e..b032772bd144 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -7526,6 +7526,8 @@ notrace void wq_watchdog_touch(int cpu) { if (cpu >= 0) per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; + else + WARN_ONCE(1, "%s should be called with valid CPU", __func__); wq_watchdog_touched = jiffies; } -- cgit v1.2.3 From 98f887f820c993e05a12e8aa816c80b8661d4c87 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 25 Jun 2024 21:42:45 +1000 Subject: workqueue: Improve scalability of workqueue watchdog touch On a ~2000 CPU powerpc system, hard lockups have been observed in the workqueue code when stop_machine runs (in this case due to CPU hotplug). This is due to lots of CPUs spinning in multi_cpu_stop, calling touch_nmi_watchdog() which ends up calling wq_watchdog_touch(). wq_watchdog_touch() writes to the global variable wq_watchdog_touched, and that can find itself in the same cacheline as other important workqueue data, which slows down operations to the point of lockups. In the case of the following abridged trace, worker_pool_idr was in the hot line, causing the lockups to always appear at idr_find. watchdog: CPU 1125 self-detected hard LOCKUP @ idr_find Call Trace: get_work_pool __queue_work call_timer_fn run_timer_softirq __do_softirq do_softirq_own_stack irq_exit timer_interrupt decrementer_common_virt * interrupt: 900 (timer) at multi_cpu_stop multi_cpu_stop cpu_stopper_thread smpboot_thread_fn kthread Fix this by having wq_watchdog_touch() only write to the line if the last time a touch was recorded exceeds 1/4 of the watchdog threshold. Reported-by: Srikar Dronamraju Signed-off-by: Nicholas Piggin Reviewed-by: Paul E. McKenney Signed-off-by: Tejun Heo --- kernel/workqueue.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b032772bd144..e271d02f3d8c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -7524,12 +7524,18 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) notrace void wq_watchdog_touch(int cpu) { + unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; + unsigned long touch_ts = READ_ONCE(wq_watchdog_touched); + unsigned long now = jiffies; + if (cpu >= 0) - per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; + per_cpu(wq_watchdog_touched_cpu, cpu) = now; else WARN_ONCE(1, "%s should be called with valid CPU", __func__); - wq_watchdog_touched = jiffies; + /* Don't unnecessarily store to global cacheline */ + if (time_after(now, touch_ts + thresh / 4)) + WRITE_ONCE(wq_watchdog_touched, jiffies); } static void wq_watchdog_set_thresh(unsigned long thresh) -- cgit v1.2.3 From 841658832335a32dd86f4e4d3aab7d14188b268b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 2 Jul 2024 12:14:55 +0800 Subject: workqueue: Update cpumasks after only applying it successfully Make workqueue_unbound_exclude_cpumask() and workqueue_set_unbound_cpumask() only update wq_isolated_cpumask and wq_requested_unbound_cpumask when workqueue_apply_unbound_cpumask() returns successfully. 
Fixes: fe28f631fa94("workqueue: Add workqueue_unbound_exclude_cpumask() to exclude CPUs from wq_unbound_cpumask") Cc: Waiman Long Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e271d02f3d8c..6adee950077a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -6848,9 +6848,6 @@ int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask) lockdep_assert_cpus_held(); mutex_lock(&wq_pool_mutex); - /* Save the current isolated cpumask & export it via sysfs */ - cpumask_copy(wq_isolated_cpumask, exclude_cpumask); - /* * If the operation fails, it will fall back to * wq_requested_unbound_cpumask which is initially set to @@ -6862,6 +6859,10 @@ int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask) if (!cpumask_equal(cpumask, wq_unbound_cpumask)) ret = workqueue_apply_unbound_cpumask(cpumask); + /* Save the current isolated cpumask & export it via sysfs */ + if (!ret) + cpumask_copy(wq_isolated_cpumask, exclude_cpumask); + mutex_unlock(&wq_pool_mutex); free_cpumask_var(cpumask); return ret; @@ -7197,7 +7198,6 @@ static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) cpumask_and(cpumask, cpumask, cpu_possible_mask); if (!cpumask_empty(cpumask)) { apply_wqattrs_lock(); - cpumask_copy(wq_requested_unbound_cpumask, cpumask); if (cpumask_equal(cpumask, wq_unbound_cpumask)) { ret = 0; goto out_unlock; @@ -7206,6 +7206,8 @@ static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) ret = workqueue_apply_unbound_cpumask(cpumask); out_unlock: + if (!ret) + cpumask_copy(wq_requested_unbound_cpumask, cpumask); apply_wqattrs_unlock(); } -- cgit v1.2.3 From b3d209164dc0aca115057b00d6b466793a747c87 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 2 Jul 2024 12:14:56 +0800 Subject: workqueue: Simplify goto statement Use a simple if-statement to replace the cumbersome goto-statement in workqueue_set_unbound_cpumask(). Cc: Waiman Long Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6adee950077a..32ac6f1e94a8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -7197,15 +7197,10 @@ static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) */ cpumask_and(cpumask, cpumask, cpu_possible_mask); if (!cpumask_empty(cpumask)) { + ret = 0; apply_wqattrs_lock(); - if (cpumask_equal(cpumask, wq_unbound_cpumask)) { - ret = 0; - goto out_unlock; - } - - ret = workqueue_apply_unbound_cpumask(cpumask); - -out_unlock: + if (!cpumask_equal(cpumask, wq_unbound_cpumask)) + ret = workqueue_apply_unbound_cpumask(cpumask); if (!ret) cpumask_copy(wq_requested_unbound_cpumask, cpumask); apply_wqattrs_unlock(); -- cgit v1.2.3 From c3138f3881920d1391e435aa4144b929d5237617 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 4 Jul 2024 11:49:10 +0800 Subject: workqueue: Register sysfs after the whole creation of the new wq workqueue creation includes adding it to the workqueue list. Prepare for moving the whole workqueue initializing procedure into wq_pool_mutex and cpu hotplug locks. 
Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 32ac6f1e94a8..5cb1dd0a49fb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5684,9 +5684,6 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, if (wq_online && init_rescuer(wq) < 0) goto err_destroy; - if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) - goto err_destroy; - /* * wq_pool_mutex protects global freeze state and workqueues list. * Grab it, adjust max_active and add the new @wq to workqueues @@ -5702,6 +5699,9 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, mutex_unlock(&wq_pool_mutex); + if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) + goto err_destroy; + return wq; err_free_node_nr_active: -- cgit v1.2.3 From c5178e6ca6c8063edc103b75f410add7e4565e63 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 4 Jul 2024 11:49:11 +0800 Subject: workqueue: Make rescuer initialization as the last step of the creation of a new wq For early wq allocation, rescuer initialization is the last step of the creation of a new wq. Make the behavior the same for all allocations. Prepare for initializing rescuer's affinities with the default pwq's affinities. Prepare for moving the whole workqueue initializing procedure into wq_pool_mutex and cpu hotplug locks. Cc: Juri Lelli Cc: Waiman Long Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5cb1dd0a49fb..af00e63182d0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5681,9 +5681,6 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, if (alloc_and_link_pwqs(wq) < 0) goto err_free_node_nr_active; - if (wq_online && init_rescuer(wq) < 0) - goto err_destroy; - /* * wq_pool_mutex protects global freeze state and workqueues list. * Grab it, adjust max_active and add the new @wq to workqueues @@ -5699,6 +5696,9 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, mutex_unlock(&wq_pool_mutex); + if (wq_online && init_rescuer(wq) < 0) + goto err_destroy; + if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) goto err_destroy; -- cgit v1.2.3 From 4e9a37389ec2d062e02f6647d1d60c3d11150896 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 4 Jul 2024 11:49:12 +0800 Subject: workqueue: Move kthread_flush_worker() out of alloc_and_link_pwqs() kthread_flush_worker() can't be called with wq_pool_mutex held. Prepare for moving wq_pool_mutex and cpu hotplug lock out of alloc_and_link_pwqs(). Cc: Zqiang Link: https://lore.kernel.org/lkml/20230920060704.24981-1-qiang.zhang1211@gmail.com/ Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index af00e63182d0..ab0aca0ccc8f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5467,12 +5467,6 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) } cpus_read_unlock(); - /* for unbound pwq, flush the pwq_release_worker ensures that the - * pwq_release_workfn() completes before calling kfree(wq). 
- */ - if (ret) - kthread_flush_worker(pwq_release_worker); - return ret; enomem: @@ -5705,8 +5699,15 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, return wq; err_free_node_nr_active: - if (wq->flags & WQ_UNBOUND) + /* + * Failed alloc_and_link_pwqs() may leave pending pwq->release_work, + * flushing the pwq_release_worker ensures that the pwq_release_workfn() + * completes before calling kfree(wq). + */ + if (wq->flags & WQ_UNBOUND) { + kthread_flush_worker(pwq_release_worker); free_node_nr_active(wq->node_nr_active); + } err_unreg_lockdep: wq_unregister_lockdep(wq); wq_free_lockdep(wq); -- cgit v1.2.3 From 1726a17135905e2d2773f18d47bd4e17dd26e1ed Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 4 Jul 2024 11:49:13 +0800 Subject: workqueue: Put PWQ allocation and WQ enlistment in the same lock C.S. The PWQ allocation and WQ enlistment are not within the same lock-held critical section; therefore, their states can become out of sync when the user modifies the unbound mask or if CPU hotplug events occur in the interim since those operations only update the WQs that are already in the list. Make the PWQ allocation and WQ enlistment atomic. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 54 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 26 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ab0aca0ccc8f..0c06ffd0174e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5108,6 +5108,19 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, return pwq; } +static void apply_wqattrs_lock(void) +{ + /* CPUs should stay stable across pwq creations and installations */ + cpus_read_lock(); + mutex_lock(&wq_pool_mutex); +} + +static void apply_wqattrs_unlock(void) +{ + mutex_unlock(&wq_pool_mutex); + cpus_read_unlock(); +} + /** * wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod * @attrs: the wq_attrs of the default pwq of the target workqueue @@ -5419,6 +5432,9 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) bool highpri = wq->flags & WQ_HIGHPRI; int cpu, ret; + lockdep_assert_cpus_held(); + lockdep_assert_held(&wq_pool_mutex); + wq->cpu_pwq = alloc_percpu(struct pool_workqueue *); if (!wq->cpu_pwq) goto enomem; @@ -5452,20 +5468,18 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) return 0; } - cpus_read_lock(); if (wq->flags & __WQ_ORDERED) { struct pool_workqueue *dfl_pwq; - ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]); + ret = apply_workqueue_attrs_locked(wq, ordered_wq_attrs[highpri]); /* there should only be single pwq for ordering guarantee */ dfl_pwq = rcu_access_pointer(wq->dfl_pwq); WARN(!ret && (wq->pwqs.next != &dfl_pwq->pwqs_node || wq->pwqs.prev != &dfl_pwq->pwqs_node), "ordering guarantee broken for workqueue %s\n", wq->name); } else { - ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); + ret = apply_workqueue_attrs_locked(wq, unbound_std_wq_attrs[highpri]); } - cpus_read_unlock(); return ret; @@ -5672,15 +5686,15 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, goto err_unreg_lockdep; } - if (alloc_and_link_pwqs(wq) < 0) - goto err_free_node_nr_active; - /* - * wq_pool_mutex protects global freeze state and workqueues list. - * Grab it, adjust max_active and add the new @wq to workqueues - * list. + * wq_pool_mutex protects the workqueues list, allocations of PWQs, + * and the global freeze state. 
alloc_and_link_pwqs() also requires + * cpus_read_lock() for PWQs' affinities. */ - mutex_lock(&wq_pool_mutex); + apply_wqattrs_lock(); + + if (alloc_and_link_pwqs(wq) < 0) + goto err_unlock_free_node_nr_active; mutex_lock(&wq->mutex); wq_adjust_max_active(wq); @@ -5688,7 +5702,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, list_add_tail_rcu(&wq->list, &workqueues); - mutex_unlock(&wq_pool_mutex); + apply_wqattrs_unlock(); if (wq_online && init_rescuer(wq) < 0) goto err_destroy; @@ -5698,7 +5712,8 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, return wq; -err_free_node_nr_active: +err_unlock_free_node_nr_active: + apply_wqattrs_unlock(); /* * Failed alloc_and_link_pwqs() may leave pending pwq->release_work, * flushing the pwq_release_worker ensures that the pwq_release_workfn() * completes before calling kfree(wq). @@ -6987,19 +7002,6 @@ static struct attribute *wq_sysfs_attrs[] = { }; ATTRIBUTE_GROUPS(wq_sysfs); -static void apply_wqattrs_lock(void) -{ - /* CPUs should stay stable across pwq creations and installations */ - cpus_read_lock(); - mutex_lock(&wq_pool_mutex); -} - -static void apply_wqattrs_unlock(void) -{ - mutex_unlock(&wq_pool_mutex); - cpus_read_unlock(); -} - static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, char *buf) { -- cgit v1.2.3 From 449b31ad2937406981685348ad75abefb1f63cba Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 4 Jul 2024 11:49:14 +0800 Subject: workqueue: Init rescuer's affinities as the wq's effective cpumask Make it consistent with apply_wqattrs_commit(). Link: https://lore.kernel.org/lkml/20240203154334.791910-5-longman@redhat.com/ Cc: Juri Lelli Cc: Waiman Long Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0c06ffd0174e..4337910cc034 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5516,6 +5516,8 @@ static int init_rescuer(struct workqueue_struct *wq) struct worker *rescuer; int ret; + lockdep_assert_held(&wq_pool_mutex); + if (!(wq->flags & WQ_MEM_RECLAIM)) return 0; @@ -5538,7 +5540,7 @@ static int init_rescuer(struct workqueue_struct *wq) wq->rescuer = rescuer; if (wq->flags & WQ_UNBOUND) - kthread_bind_mask(rescuer->task, wq_unbound_cpumask); + kthread_bind_mask(rescuer->task, unbound_effective_cpumask(wq)); else kthread_bind_mask(rescuer->task, cpu_possible_mask); wake_up_process(rescuer->task); @@ -5702,10 +5704,10 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, list_add_tail_rcu(&wq->list, &workqueues); - apply_wqattrs_unlock(); - if (wq_online && init_rescuer(wq) < 0) - goto err_destroy; + goto err_unlock_destroy; + + apply_wqattrs_unlock(); if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) goto err_destroy; @@ -5730,6 +5732,8 @@ err_free_wq: free_workqueue_attrs(wq->unbound_attrs); kfree(wq); return NULL; +err_unlock_destroy: + apply_wqattrs_unlock(); err_destroy: destroy_workqueue(wq); return NULL; -- cgit v1.2.3 From 8d84baf76045f5b9567581cde17e9c3d256bb073 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 11 Jul 2024 16:35:41 +0800 Subject: workqueue: Add wq_online_cpumask The new wq_online_cpumask mirrors the cpu_online_mask except during hotplugging; specifically, it differs between the hotplugging stages of workqueue_offline_cpu() and workqueue_online_cpu(), during which the transitioning CPU is not represented in the mask.
Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4337910cc034..e49afcb3d90a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -444,6 +444,9 @@ static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait); static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ +/* PL: mirror the cpu_online_mask excluding the CPU in the midst of hotplugging */ +static cpumask_var_t wq_online_cpumask; + /* PL&A: allowable cpus for unbound wqs and work items */ static cpumask_var_t wq_unbound_cpumask; @@ -6574,6 +6577,8 @@ int workqueue_online_cpu(unsigned int cpu) mutex_lock(&wq_pool_mutex); + cpumask_set_cpu(cpu, wq_online_cpumask); + for_each_pool(pool, pi) { /* BH pools aren't affected by hotplug */ if (pool->flags & POOL_BH) @@ -6620,6 +6625,9 @@ int workqueue_offline_cpu(unsigned int cpu) /* update pod affinity of unbound workqueues */ mutex_lock(&wq_pool_mutex); + + cpumask_clear_cpu(cpu, wq_online_cpumask); + list_for_each_entry(wq, &workqueues, list) { struct workqueue_attrs *attrs = wq->unbound_attrs; @@ -7649,10 +7657,12 @@ void __init workqueue_init_early(void) BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); + BUG_ON(!alloc_cpumask_var(&wq_online_cpumask, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL)); + cpumask_copy(wq_online_cpumask, cpu_online_mask); cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ)); restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN)); -- cgit v1.2.3 From fbb3d4c15dc0fe8a439de267db927a9ab2f44e98 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 11 Jul 2024 16:35:42 +0800 Subject: workqueue: Simplify wq_calc_pod_cpumask() with wq_online_cpumask Avoid relying on cpu_online_mask for wqattrs changes so that cpus_read_lock() can be removed from apply_wqattrs_lock(). And with wq_online_cpumask, attrs->__pod_cpumask doesn't need to be reused as a temporary storage to calculate if the pod have any online CPUs @attrs wants since @cpu_going_down is not in the wq_online_cpumask. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e49afcb3d90a..a345a72395e7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5146,20 +5146,14 @@ static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu, const struct wq_pod_type *pt = wqattrs_pod_type(attrs); int pod = pt->cpu_pod[cpu]; - /* does @pod have any online CPUs @attrs wants? */ + /* calculate possible CPUs in @pod that @attrs wants */ cpumask_and(attrs->__pod_cpumask, pt->pod_cpus[pod], attrs->cpumask); - cpumask_and(attrs->__pod_cpumask, attrs->__pod_cpumask, cpu_online_mask); - if (cpu_going_down >= 0) - cpumask_clear_cpu(cpu_going_down, attrs->__pod_cpumask); - - if (cpumask_empty(attrs->__pod_cpumask)) { + /* does @pod have any online CPUs @attrs wants? 
*/ + if (!cpumask_intersects(attrs->__pod_cpumask, wq_online_cpumask)) { cpumask_copy(attrs->__pod_cpumask, attrs->cpumask); return; } - - /* yeap, return possible CPUs in @pod that @attrs wants */ - cpumask_and(attrs->__pod_cpumask, attrs->cpumask, pt->pod_cpus[pod]); - if (cpumask_empty(attrs->__pod_cpumask)) pr_warn_once("WARNING: workqueue cpumask: online intersect > " "possible intersect\n"); -- cgit v1.2.3 From 19af457573838785290d27dd09a59140f541d1d8 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 11 Jul 2024 16:35:43 +0800 Subject: workqueue: Remove cpus_read_lock() from apply_wqattrs_lock() 1726a1713590 ("workqueue: Put PWQ allocation and WQ enlistment in the same lock C.S.") led to the following possible deadlock: WARNING: possible recursive locking detected 6.10.0-rc5-00004-g1d4c6111406c #1 Not tainted -------------------------------------------- swapper/0/1 is trying to acquire lock: c27760f4 (cpu_hotplug_lock){++++}-{0:0}, at: alloc_workqueue (kernel/workqueue.c:5152 kernel/workqueue.c:5730) but task is already holding lock: c27760f4 (cpu_hotplug_lock){++++}-{0:0}, at: padata_alloc (kernel/padata.c:1007) ... stack backtrace: ... cpus_read_lock (include/linux/percpu-rwsem.h:53 kernel/cpu.c:488) alloc_workqueue (kernel/workqueue.c:5152 kernel/workqueue.c:5730) padata_alloc (kernel/padata.c:1007 (discriminator 1)) pcrypt_init_padata (crypto/pcrypt.c:327 (discriminator 1)) pcrypt_init (crypto/pcrypt.c:353) do_one_initcall (init/main.c:1267) do_initcalls (init/main.c:1328 (discriminator 1) init/main.c:1345 (discriminator 1)) kernel_init_freeable (init/main.c:1364) kernel_init (init/main.c:1469) ret_from_fork (arch/x86/kernel/process.c:153) ret_from_fork_asm (arch/x86/entry/entry_32.S:737) entry_INT80_32 (arch/x86/entry/entry_32.S:944) This is caused by pcrypt allocating a workqueue while holding cpus_read_lock(), so workqueue code can't do it again as that can lead to deadlocks if down_write starts after the first down_read. The pwq creations and installations have been reworked based on wq_online_cpumask rather than cpu_online_mask, making cpus_read_lock() unneeded during wqattrs changes. Fix the deadlock by removing cpus_read_lock() from apply_wqattrs_lock(). tj: Updated changelog. Signed-off-by: Lai Jiangshan Fixes: 1726a1713590 ("workqueue: Put PWQ allocation and WQ enlistment in the same lock C.S.") Link: http://lkml.kernel.org/r/202407081521.83b627c1-lkp@intel.com Signed-off-by: Tejun Heo --- kernel/workqueue.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a345a72395e7..9b7c1fcd934b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5113,15 +5113,12 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, static void apply_wqattrs_lock(void) { - /* CPUs should stay stable across pwq creations and installations */ - cpus_read_lock(); mutex_lock(&wq_pool_mutex); } static void apply_wqattrs_unlock(void) { mutex_unlock(&wq_pool_mutex); - cpus_read_unlock(); } /** -- cgit v1.2.3 From 2cb61f76be3b17d5ad42ba3b7b23c7bf98253a0b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 11 Jul 2024 16:35:44 +0800 Subject: workqueue: Remove the unneeded cpumask empty check in wq_calc_pod_cpumask() The cpumask empty check in wq_calc_pod_cpumask() has long been useless. It works purely as documentation, stating that the cpumask cannot be empty after the function returns.
Now the code above is even more explicit that the cpumask is not empty, so the document-only empty check can be removed. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9b7c1fcd934b..298ce6b4989d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5150,10 +5150,6 @@ static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu, cpumask_copy(attrs->__pod_cpumask, attrs->cpumask); return; } - - if (cpumask_empty(attrs->__pod_cpumask)) - pr_warn_once("WARNING: workqueue cpumask: online intersect > " - "possible intersect\n"); } /* install @pwq into @wq and return the old pwq, @cpu < 0 for dfl_pwq */ -- cgit v1.2.3 From 88a41b185d3d6bb03733441c4e440c80dbd1866a Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 11 Jul 2024 16:35:45 +0800 Subject: workqueue: Remove the argument @cpu_going_down from wq_calc_pod_cpumask() wq_calc_pod_cpumask() uses wq_online_cpumask, which excludes the cpu going down, so the argument cpu_going_down is unused and can be removed. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 298ce6b4989d..42190bdeacbd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5125,10 +5125,8 @@ static void apply_wqattrs_unlock(void) * wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod * @attrs: the wq_attrs of the default pwq of the target workqueue * @cpu: the target CPU - * @cpu_going_down: if >= 0, the CPU to consider as offline * - * Calculate the cpumask a workqueue with @attrs should use on @pod. If - * @cpu_going_down is >= 0, that cpu is considered offline during calculation. + * Calculate the cpumask a workqueue with @attrs should use on @pod. * The result is stored in @attrs->__pod_cpumask. * * If pod affinity is not enabled, @attrs->cpumask is always used. If enabled @@ -5137,8 +5135,7 @@ static void apply_wqattrs_unlock(void) * * The caller is responsible for ensuring that the cpumask of @pod stays stable. */ -static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu, - int cpu_going_down) +static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu) { const struct wq_pod_type *pt = wqattrs_pod_type(attrs); int pod = pt->cpu_pod[cpu]; @@ -5234,7 +5231,7 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, ctx->dfl_pwq->refcnt++; ctx->pwq_tbl[cpu] = ctx->dfl_pwq; } else { - wq_calc_pod_cpumask(new_attrs, cpu, -1); + wq_calc_pod_cpumask(new_attrs, cpu); ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, new_attrs); if (!ctx->pwq_tbl[cpu]) goto out_free; @@ -5368,7 +5365,6 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, static void wq_update_pod(struct workqueue_struct *wq, int cpu, int hotplug_cpu, bool online) { - int off_cpu = online ? 
-1 : hotplug_cpu; struct pool_workqueue *old_pwq = NULL, *pwq; struct workqueue_attrs *target_attrs; @@ -5388,7 +5384,7 @@ static void wq_update_pod(struct workqueue_struct *wq, int cpu, wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask); /* nothing to do if the target cpumask matches the current pwq */ - wq_calc_pod_cpumask(target_attrs, cpu, off_cpu); + wq_calc_pod_cpumask(target_attrs, cpu); if (wqattrs_equal(target_attrs, unbound_pwq(wq, cpu)->pool->attrs)) return; -- cgit v1.2.3 From d160a58de59cd617d9648d0458d572d063d66b71 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 11 Jul 2024 16:35:46 +0800 Subject: workqueue: Remove the arguments @hotplug_cpu and @online from wq_update_pod() The arguments @hotplug_cpu and @online are not used in wq_update_pod() since the functions called by wq_update_pod() don't need them. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 42190bdeacbd..d9e0006c0be7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5344,8 +5344,6 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, * wq_update_pod - update pod affinity of a wq for CPU hot[un]plug * @wq: the target workqueue * @cpu: the CPU to update pool association for - * @hotplug_cpu: the CPU coming up or going down - * @online: whether @cpu is coming up or going down * * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update pod affinity of @@ -5362,8 +5360,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, * CPU_DOWN. If a workqueue user wants strict affinity, it's the user's * responsibility to flush the work item from CPU_DOWN_PREPARE. */ -static void wq_update_pod(struct workqueue_struct *wq, int cpu, - int hotplug_cpu, bool online) +static void wq_update_pod(struct workqueue_struct *wq, int cpu) { struct pool_workqueue *old_pwq = NULL, *pwq; struct workqueue_attrs *target_attrs; @@ -6584,7 +6581,7 @@ int workqueue_online_cpu(unsigned int cpu) int tcpu; for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) - wq_update_pod(wq, tcpu, cpu, true); + wq_update_pod(wq, tcpu); mutex_lock(&wq->mutex); wq_update_node_max_active(wq, -1); @@ -6619,7 +6616,7 @@ int workqueue_offline_cpu(unsigned int cpu) int tcpu; for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) - wq_update_pod(wq, tcpu, cpu, false); + wq_update_pod(wq, tcpu); mutex_lock(&wq->mutex); wq_update_node_max_active(wq, cpu); @@ -6908,7 +6905,7 @@ static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp) list_for_each_entry(wq, &workqueues, list) { for_each_online_cpu(cpu) { - wq_update_pod(wq, cpu, cpu, true); + wq_update_pod(wq, cpu); } } @@ -7927,7 +7924,7 @@ void __init workqueue_init_topology(void) */ list_for_each_entry(wq, &workqueues, list) { for_each_online_cpu(cpu) - wq_update_pod(wq, cpu, cpu, true); + wq_update_pod(wq, cpu); if (wq->flags & WQ_UNBOUND) { mutex_lock(&wq->mutex); wq_update_node_max_active(wq, -1); -- cgit v1.2.3 From b2b1f9338400de0fb65e2ff5fab1b5617dcb5a97 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 11 Jul 2024 16:35:47 +0800 Subject: workqueue: Rename wq_update_pod() to unbound_wq_update_pwq() What wq_update_pod() does is just to update the pwq of the specific cpu. Rename it and update the comments. 
Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d9e0006c0be7..79a178500a77 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -433,7 +433,7 @@ static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES]; static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE; /* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */ -static struct workqueue_attrs *wq_update_pod_attrs_buf; +static struct workqueue_attrs *unbound_wq_update_pwq_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */ @@ -5341,13 +5341,12 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, } /** - * wq_update_pod - update pod affinity of a wq for CPU hot[un]plug + * unbound_wq_update_pwq - update a pwq slot for CPU hot[un]plug * @wq: the target workqueue - * @cpu: the CPU to update pool association for + * @cpu: the CPU to update the pwq slot for * * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and - * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update pod affinity of - * @wq accordingly. + * %CPU_DOWN_FAILED. @cpu is in the same pod of the CPU being hot[un]plugged. * * * If pod affinity can't be adjusted due to memory allocation failure, it falls @@ -5360,7 +5359,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, * CPU_DOWN. If a workqueue user wants strict affinity, it's the user's * responsibility to flush the work item from CPU_DOWN_PREPARE. */ -static void wq_update_pod(struct workqueue_struct *wq, int cpu) +static void unbound_wq_update_pwq(struct workqueue_struct *wq, int cpu) { struct pool_workqueue *old_pwq = NULL, *pwq; struct workqueue_attrs *target_attrs; @@ -5375,7 +5374,7 @@ static void wq_update_pod(struct workqueue_struct *wq, int cpu) * Let's use a preallocated one. The following buf is protected by * CPU hotplug exclusion. */ - target_attrs = wq_update_pod_attrs_buf; + target_attrs = unbound_wq_update_pwq_attrs_buf; copy_workqueue_attrs(target_attrs, wq->unbound_attrs); wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask); @@ -6581,7 +6580,7 @@ int workqueue_online_cpu(unsigned int cpu) int tcpu; for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) - wq_update_pod(wq, tcpu); + unbound_wq_update_pwq(wq, tcpu); mutex_lock(&wq->mutex); wq_update_node_max_active(wq, -1); @@ -6616,7 +6615,7 @@ int workqueue_offline_cpu(unsigned int cpu) int tcpu; for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) - wq_update_pod(wq, tcpu); + unbound_wq_update_pwq(wq, tcpu); mutex_lock(&wq->mutex); wq_update_node_max_active(wq, cpu); @@ -6904,9 +6903,8 @@ static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp) wq_affn_dfl = affn; list_for_each_entry(wq, &workqueues, list) { - for_each_online_cpu(cpu) { - wq_update_pod(wq, cpu); - } + for_each_online_cpu(cpu) + unbound_wq_update_pwq(wq, cpu); } mutex_unlock(&wq_pool_mutex); @@ -7653,8 +7651,8 @@ void __init workqueue_init_early(void) pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); - wq_update_pod_attrs_buf = alloc_workqueue_attrs(); - BUG_ON(!wq_update_pod_attrs_buf); + unbound_wq_update_pwq_attrs_buf = alloc_workqueue_attrs(); + BUG_ON(!unbound_wq_update_pwq_attrs_buf); /* * If nohz_full is enabled, set power efficient workqueue as unbound. 
@@ -7919,12 +7917,12 @@ void __init workqueue_init_topology(void) /* * Workqueues allocated earlier would have all CPUs sharing the default - * worker pool. Explicitly call wq_update_pod() on all workqueue and CPU - * combinations to apply per-pod sharing. + * worker pool. Explicitly call unbound_wq_update_pwq() on all workqueue + * and CPU combinations to apply per-pod sharing. */ list_for_each_entry(wq, &workqueues, list) { for_each_online_cpu(cpu) - wq_update_pod(wq, cpu); + unbound_wq_update_pwq(wq, cpu); if (wq->flags & WQ_UNBOUND) { mutex_lock(&wq->mutex); wq_update_node_max_active(wq, -1); -- cgit v1.2.3 From 58629d4871e8eb2c385b16a73a8451669db59f39 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 3 Jul 2024 17:27:41 +0800 Subject: workqueue: Always queue work items to the newest PWQ for order workqueues To ensure non-reentrancy, __queue_work() attempts to enqueue a work item to the pool of the currently executing worker. This is not only unnecessary for an ordered workqueue, where order inherently suggests non-reentrancy, but it could also disrupt the sequence if the item is not enqueued on the newest PWQ. Just queue it to the newest PWQ and let order management guarantee non-reentrancy. Signed-off-by: Lai Jiangshan Fixes: 4c065dbce1e8 ("workqueue: Enable unbound cpumask update on ordered workqueues") Cc: stable@vger.kernel.org # v6.9+ Signed-off-by: Tejun Heo (cherry picked from commit 74347be3edfd11277799242766edf844c43dd5d3) --- kernel/workqueue.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 79a178500a77..e3ab09e70ba9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2274,9 +2274,13 @@ retry: * If @work was previously on a different pool, it might still be * running there, in which case the work needs to be queued on that * pool to guarantee non-reentrancy. + * + * For ordered workqueue, work items must be queued on the newest pwq + * for accurate order management. Guaranteed order also guarantees + * non-reentrancy. See the comments above unplug_oldest_pwq(). */ last_pool = get_work_pool(work); - if (last_pool && last_pool != pool) { + if (last_pool && last_pool != pool && !(wq->flags & __WQ_ORDERED)) { struct worker *worker; raw_spin_lock(&last_pool->lock); -- cgit v1.2.3
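A minimal usage sketch, not taken from the patches above, illustrating the property the last patch relies on: an ordered workqueue executes at most one work item at a time, in queueing order, so ordering alone already provides non-reentrancy and __queue_work() no longer needs to chase the last executing pool for such workqueues. The my_* names below are hypothetical; only alloc_ordered_workqueue(), queue_work() and destroy_workqueue() are real workqueue APIs.

#include <linux/module.h>
#include <linux/printk.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_ordered_wq;

static void my_work_fn(struct work_struct *work)
{
	/* at most one item from my_ordered_wq runs at any given time */
	pr_info("ordered work item executed\n");
}

static DECLARE_WORK(my_work_a, my_work_fn);
static DECLARE_WORK(my_work_b, my_work_fn);

static int __init my_init(void)
{
	/* ordered workqueues force max_active to 1 and run items in FIFO order */
	my_ordered_wq = alloc_ordered_workqueue("my_ordered", WQ_MEM_RECLAIM);
	if (!my_ordered_wq)
		return -ENOMEM;

	/* a then b, strictly in queueing order and never concurrently */
	queue_work(my_ordered_wq, &my_work_a);
	queue_work(my_ordered_wq, &my_work_b);
	return 0;
}

static void __exit my_exit(void)
{
	/* drains pending items before tearing the workqueue down */
	destroy_workqueue(my_ordered_wq);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");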